[llvm] [SDAG] Heed enableAggressiveFMAFusion when folding fma(a,b,c*d)+e (PR #94209)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 11 00:17:46 PDT 2024


https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/94209

>From 41c4da1224f7f48c4de8421b4f12d516fe1e056c Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 31 May 2024 17:21:25 +0100
Subject: [PATCH 1/2] [SDAG] Heed enableAggressiveFMAFusion when folding
 fma(a,b,c*d)+e

enableAggressiveFMAFusion is supposed to enable folding to fma even if
the intermediate nodes have multiple uses. Implement this for the fold:
  fma(a,b,c*d)+e -> fma(a,b,fma(c,d,e))
and its generalization to longer chains of fmas.

Since the intermediate nodes can have multiple uses, we can no longer
modify them in place, so build a new chain of fmas instead.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  36 +-
 .../test/CodeGen/AArch64/storepairsuppress.ll | 524 +++++++++---------
 .../CodeGen/AMDGPU/dagcombine-fma-crash.ll    |  24 +-
 .../CodeGen/AMDGPU/dagcombine-fma-fmad.ll     |  32 +-
 .../CodeGen/AMDGPU/fadd-fma-fmul-combine.ll   |  84 +--
 llvm/test/CodeGen/PowerPC/fma-precision.ll    |  16 +-
 6 files changed, 361 insertions(+), 355 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5148b7258257f..d87f9a9a2977c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15881,29 +15881,29 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   bool CanReassociate =
       Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
   if (CanReassociate) {
-    SDValue FMA, E;
-    if (isFusedOp(N0) && N0.hasOneUse()) {
-      FMA = N0;
+    SDValue Tmp, E;
+    if (isFusedOp(N0) && (Aggressive || N0.hasOneUse())) {
+      Tmp = N0;
       E = N1;
-    } else if (isFusedOp(N1) && N1.hasOneUse()) {
-      FMA = N1;
+    } else if (isFusedOp(N1) && (Aggressive || N1.hasOneUse())) {
+      Tmp = N1;
       E = N0;
     }
 
-    SDValue TmpFMA = FMA;
-    while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) {
-      SDValue FMul = TmpFMA->getOperand(2);
-      if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) {
-        SDValue C = FMul.getOperand(0);
-        SDValue D = FMul.getOperand(1);
-        SDValue CDE = matcher.getNode(PreferredFusedOpcode, SL, VT, C, D, E);
-        DAG.ReplaceAllUsesOfValueWith(FMul, CDE);
-        // Replacing the inner FMul could cause the outer FMA to be simplified
-        // away.
-        return FMA.getOpcode() == ISD::DELETED_NODE ? SDValue(N, 0) : FMA;
+    if (Tmp) {
+      SmallVector<SDNode *> FMAs;
+      do {
+        FMAs.push_back(Tmp.getNode());
+        Tmp = Tmp->getOperand(2);
+      } while (isFusedOp(Tmp) && (Aggressive || Tmp.hasOneUse()));
+      if (matcher.match(Tmp, ISD::FMUL) && (Aggressive || Tmp.hasOneUse())) {
+        Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT, Tmp.getOperand(0),
+                              Tmp.getOperand(1), E);
+        for (SDNode *FMA : reverse(FMAs))
+          Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT,
+                                FMA->getOperand(0), FMA->getOperand(1), Tmp);
+        return Tmp;
       }
-
-      TmpFMA = TmpFMA->getOperand(2);
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress.ll b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
index 0571bbc278a6f..c2dc73b54a057 100644
--- a/llvm/test/CodeGen/AArch64/storepairsuppress.ll
+++ b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
@@ -39,165 +39,165 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
 ; SUPPRESS-NEXT:    ldp s2, s3, [x8]
 ; SUPPRESS-NEXT:    ldp s4, s5, [x8, #8]
 ; SUPPRESS-NEXT:    fmul s6, s5, s1
-; SUPPRESS-NEXT:    fmul s1, s4, s1
-; SUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; SUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; SUPPRESS-NEXT:    fadd s1, s4, s2
-; SUPPRESS-NEXT:    fadd s5, s0, s3
-; SUPPRESS-NEXT:    stp s1, s5, [x8]
-; SUPPRESS-NEXT:    fsub s2, s2, s4
-; SUPPRESS-NEXT:    fsub s0, s3, s0
+; SUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s2
+; SUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; SUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8]
+; SUPPRESS-NEXT:    fsub s2, s2, s6
+; SUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; SUPPRESS-NEXT:    fmsub s0, s5, s0, s1
 ; SUPPRESS-NEXT:    stp s2, s0, [x8, #8]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
-; SUPPRESS-NEXT:    ldp s3, s4, [x9]
-; SUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
-; SUPPRESS-NEXT:    ldp s16, s17, [x8, #24]
-; SUPPRESS-NEXT:    fmul s18, s17, s4
-; SUPPRESS-NEXT:    fmul s4, s16, s4
-; SUPPRESS-NEXT:    fnmsub s16, s16, s3, s18
-; SUPPRESS-NEXT:    fmadd s3, s17, s3, s4
-; SUPPRESS-NEXT:    fadd s4, s16, s6
-; SUPPRESS-NEXT:    fadd s17, s3, s7
-; SUPPRESS-NEXT:    stp s4, s17, [x8, #16]
-; SUPPRESS-NEXT:    fsub s6, s6, s16
-; SUPPRESS-NEXT:    fsub s3, s7, s3
-; SUPPRESS-NEXT:    stp s6, s3, [x8, #24]
+; SUPPRESS-NEXT:    ldp s1, s3, [x9]
+; SUPPRESS-NEXT:    ldp s4, s5, [x8, #16]
+; SUPPRESS-NEXT:    ldp s6, s17, [x8, #24]
+; SUPPRESS-NEXT:    fmul s18, s17, s3
+; SUPPRESS-NEXT:    fnmsub s18, s6, s1, s18
+; SUPPRESS-NEXT:    fadd s19, s18, s4
+; SUPPRESS-NEXT:    fmadd s20, s6, s3, s5
+; SUPPRESS-NEXT:    fmadd s20, s17, s1, s20
+; SUPPRESS-NEXT:    stp s19, s20, [x8, #16]
+; SUPPRESS-NEXT:    fsub s4, s4, s18
+; SUPPRESS-NEXT:    fmsub s3, s6, s3, s5
+; SUPPRESS-NEXT:    fmsub s1, s17, s1, s3
+; SUPPRESS-NEXT:    stp s4, s1, [x8, #24]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
-; SUPPRESS-NEXT:    ldp s7, s16, [x9]
-; SUPPRESS-NEXT:    fmul s18, s16, s17
-; SUPPRESS-NEXT:    fmul s17, s7, s17
-; SUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; SUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; SUPPRESS-NEXT:    fadd s16, s7, s1
-; SUPPRESS-NEXT:    fadd s17, s4, s5
-; SUPPRESS-NEXT:    stp s16, s17, [x8]
-; SUPPRESS-NEXT:    fsub s1, s1, s7
-; SUPPRESS-NEXT:    fsub s4, s5, s4
-; SUPPRESS-NEXT:    stp s1, s4, [x8, #16]
+; SUPPRESS-NEXT:    ldp s3, s5, [x9]
+; SUPPRESS-NEXT:    fmul s6, s5, s20
+; SUPPRESS-NEXT:    fnmsub s6, s3, s19, s6
+; SUPPRESS-NEXT:    fadd s17, s6, s7
+; SUPPRESS-NEXT:    fmadd s18, s3, s20, s16
+; SUPPRESS-NEXT:    fmadd s18, s5, s19, s18
+; SUPPRESS-NEXT:    stp s17, s18, [x8]
+; SUPPRESS-NEXT:    fsub s6, s7, s6
+; SUPPRESS-NEXT:    fmsub s3, s3, s20, s16
+; SUPPRESS-NEXT:    fmsub s3, s5, s19, s3
+; SUPPRESS-NEXT:    stp s6, s3, [x8, #16]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    lsl x9, x3, #4
 ; SUPPRESS-NEXT:    add x10, x10, x9
-; SUPPRESS-NEXT:    ldp s1, s4, [x10]
-; SUPPRESS-NEXT:    fmul s5, s4, s3
-; SUPPRESS-NEXT:    fmul s3, s1, s3
-; SUPPRESS-NEXT:    fnmsub s1, s1, s6, s5
-; SUPPRESS-NEXT:    fmadd s3, s4, s6, s3
-; SUPPRESS-NEXT:    fadd s4, s1, s2
-; SUPPRESS-NEXT:    fadd s5, s3, s0
-; SUPPRESS-NEXT:    stp s4, s5, [x8, #8]
-; SUPPRESS-NEXT:    fsub s1, s2, s1
-; SUPPRESS-NEXT:    fsub s0, s0, s3
-; SUPPRESS-NEXT:    stp s1, s0, [x8, #24]
+; SUPPRESS-NEXT:    ldp s3, s5, [x10]
+; SUPPRESS-NEXT:    fmul s6, s5, s1
+; SUPPRESS-NEXT:    fnmsub s6, s3, s4, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s2
+; SUPPRESS-NEXT:    fmadd s16, s3, s1, s0
+; SUPPRESS-NEXT:    fmadd s16, s5, s4, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8, #8]
+; SUPPRESS-NEXT:    fsub s2, s2, s6
+; SUPPRESS-NEXT:    fmsub s0, s3, s1, s0
+; SUPPRESS-NEXT:    fmsub s0, s5, s4, s0
+; SUPPRESS-NEXT:    stp s2, s0, [x8, #24]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    ldp s0, s1, [x10]
 ; SUPPRESS-NEXT:    ldp s2, s3, [x8, #32]
 ; SUPPRESS-NEXT:    ldp s4, s5, [x8, #40]
 ; SUPPRESS-NEXT:    fmul s6, s5, s1
-; SUPPRESS-NEXT:    fmul s1, s4, s1
-; SUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; SUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; SUPPRESS-NEXT:    fadd s1, s4, s2
-; SUPPRESS-NEXT:    fadd s5, s0, s3
-; SUPPRESS-NEXT:    stp s1, s5, [x8, #32]
-; SUPPRESS-NEXT:    fsub s2, s2, s4
-; SUPPRESS-NEXT:    fsub s3, s3, s0
-; SUPPRESS-NEXT:    stp s2, s3, [x8, #40]
+; SUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s2
+; SUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; SUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8, #32]
+; SUPPRESS-NEXT:    fsub s6, s2, s6
+; SUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; SUPPRESS-NEXT:    fmsub s1, s5, s0, s1
+; SUPPRESS-NEXT:    stp s6, s1, [x8, #40]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
-; SUPPRESS-NEXT:    ldp s0, s4, [x10]
-; SUPPRESS-NEXT:    ldp s6, s7, [x8, #48]
-; SUPPRESS-NEXT:    ldp s16, s17, [x8, #56]
-; SUPPRESS-NEXT:    fmul s18, s17, s4
-; SUPPRESS-NEXT:    fmul s4, s16, s4
-; SUPPRESS-NEXT:    fnmsub s16, s16, s0, s18
-; SUPPRESS-NEXT:    fmadd s0, s17, s0, s4
-; SUPPRESS-NEXT:    fadd s4, s16, s6
-; SUPPRESS-NEXT:    fadd s17, s0, s7
-; SUPPRESS-NEXT:    stp s4, s17, [x8, #48]
-; SUPPRESS-NEXT:    fsub s6, s6, s16
-; SUPPRESS-NEXT:    fsub s0, s7, s0
-; SUPPRESS-NEXT:    stp s6, s0, [x8, #56]
+; SUPPRESS-NEXT:    ldp s0, s2, [x10]
+; SUPPRESS-NEXT:    ldp s3, s4, [x8, #48]
+; SUPPRESS-NEXT:    ldp s5, s17, [x8, #56]
+; SUPPRESS-NEXT:    fmul s18, s17, s2
+; SUPPRESS-NEXT:    fnmsub s18, s5, s0, s18
+; SUPPRESS-NEXT:    fadd s19, s18, s3
+; SUPPRESS-NEXT:    fmadd s20, s5, s2, s4
+; SUPPRESS-NEXT:    fmadd s20, s17, s0, s20
+; SUPPRESS-NEXT:    stp s19, s20, [x8, #48]
+; SUPPRESS-NEXT:    fsub s18, s3, s18
+; SUPPRESS-NEXT:    fmsub s2, s5, s2, s4
+; SUPPRESS-NEXT:    fmsub s4, s17, s0, s2
+; SUPPRESS-NEXT:    stp s18, s4, [x8, #56]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
-; SUPPRESS-NEXT:    ldp s7, s16, [x10]
-; SUPPRESS-NEXT:    fmul s18, s16, s17
-; SUPPRESS-NEXT:    fmul s17, s7, s17
-; SUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; SUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; SUPPRESS-NEXT:    fadd s16, s7, s1
-; SUPPRESS-NEXT:    fadd s17, s4, s5
-; SUPPRESS-NEXT:    stp s16, s17, [x8, #32]
-; SUPPRESS-NEXT:    fsub s7, s1, s7
-; SUPPRESS-NEXT:    fsub s4, s5, s4
-; SUPPRESS-NEXT:    stp s7, s4, [x8, #48]
+; SUPPRESS-NEXT:    ldp s0, s3, [x10]
+; SUPPRESS-NEXT:    fmul s2, s3, s20
+; SUPPRESS-NEXT:    fnmsub s2, s0, s19, s2
+; SUPPRESS-NEXT:    fadd s5, s2, s7
+; SUPPRESS-NEXT:    fmadd s17, s0, s20, s16
+; SUPPRESS-NEXT:    fmadd s17, s3, s19, s17
+; SUPPRESS-NEXT:    stp s5, s17, [x8, #32]
+; SUPPRESS-NEXT:    fsub s2, s7, s2
+; SUPPRESS-NEXT:    fmsub s0, s0, s20, s16
+; SUPPRESS-NEXT:    fmsub s3, s3, s19, s0
+; SUPPRESS-NEXT:    stp s2, s3, [x8, #48]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x10, x9
-; SUPPRESS-NEXT:    ldp s1, s5, [x9]
-; SUPPRESS-NEXT:    fmul s18, s5, s0
-; SUPPRESS-NEXT:    fmul s0, s1, s0
-; SUPPRESS-NEXT:    fnmsub s1, s1, s6, s18
-; SUPPRESS-NEXT:    fmadd s5, s5, s6, s0
-; SUPPRESS-NEXT:    fadd s6, s1, s2
-; SUPPRESS-NEXT:    fadd s18, s5, s3
-; SUPPRESS-NEXT:    stp s6, s18, [x8, #40]
-; SUPPRESS-NEXT:    fsub s0, s2, s1
-; SUPPRESS-NEXT:    fsub s1, s3, s5
+; SUPPRESS-NEXT:    ldp s7, s16, [x9]
+; SUPPRESS-NEXT:    fmul s0, s16, s4
+; SUPPRESS-NEXT:    fnmsub s0, s7, s18, s0
+; SUPPRESS-NEXT:    fadd s19, s0, s6
+; SUPPRESS-NEXT:    fmadd s20, s7, s4, s1
+; SUPPRESS-NEXT:    fmadd s20, s16, s18, s20
+; SUPPRESS-NEXT:    stp s19, s20, [x8, #40]
+; SUPPRESS-NEXT:    fsub s0, s6, s0
+; SUPPRESS-NEXT:    fmsub s1, s7, s4, s1
+; SUPPRESS-NEXT:    fmsub s1, s16, s18, s1
 ; SUPPRESS-NEXT:    stp s0, s1, [x8, #56]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
-; SUPPRESS-NEXT:    ldp s2, s3, [x9]
-; SUPPRESS-NEXT:    ldp s5, s19, [x8]
-; SUPPRESS-NEXT:    fmul s20, s17, s3
-; SUPPRESS-NEXT:    fmul s3, s16, s3
-; SUPPRESS-NEXT:    fnmsub s16, s16, s2, s20
-; SUPPRESS-NEXT:    fmadd s2, s17, s2, s3
-; SUPPRESS-NEXT:    fadd s3, s16, s5
-; SUPPRESS-NEXT:    fadd s17, s2, s19
-; SUPPRESS-NEXT:    stp s3, s17, [x8]
-; SUPPRESS-NEXT:    fsub s3, s5, s16
-; SUPPRESS-NEXT:    fsub s2, s19, s2
-; SUPPRESS-NEXT:    stp s3, s2, [x8, #32]
+; SUPPRESS-NEXT:    ldp s4, s6, [x9]
+; SUPPRESS-NEXT:    ldp s7, s16, [x8]
+; SUPPRESS-NEXT:    fmul s18, s17, s6
+; SUPPRESS-NEXT:    fnmsub s18, s5, s4, s18
+; SUPPRESS-NEXT:    fadd s21, s18, s7
+; SUPPRESS-NEXT:    fmadd s22, s5, s6, s16
+; SUPPRESS-NEXT:    fmadd s22, s17, s4, s22
+; SUPPRESS-NEXT:    stp s21, s22, [x8]
+; SUPPRESS-NEXT:    fsub s7, s7, s18
+; SUPPRESS-NEXT:    fmsub s5, s5, s6, s16
+; SUPPRESS-NEXT:    fmsub s4, s17, s4, s5
+; SUPPRESS-NEXT:    stp s7, s4, [x8, #32]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x9, w3, sxtw #3
-; SUPPRESS-NEXT:    ldp s2, s3, [x9]
-; SUPPRESS-NEXT:    ldp s5, s16, [x8, #8]
-; SUPPRESS-NEXT:    fmul s17, s18, s3
-; SUPPRESS-NEXT:    fmul s3, s6, s3
-; SUPPRESS-NEXT:    fnmsub s6, s6, s2, s17
-; SUPPRESS-NEXT:    fmadd s2, s18, s2, s3
-; SUPPRESS-NEXT:    fadd s3, s6, s5
-; SUPPRESS-NEXT:    fadd s17, s2, s16
-; SUPPRESS-NEXT:    stp s3, s17, [x8, #8]
-; SUPPRESS-NEXT:    fsub s3, s5, s6
-; SUPPRESS-NEXT:    fsub s2, s16, s2
-; SUPPRESS-NEXT:    stp s3, s2, [x8, #40]
+; SUPPRESS-NEXT:    ldp s4, s5, [x9]
+; SUPPRESS-NEXT:    ldp s6, s7, [x8, #8]
+; SUPPRESS-NEXT:    fmul s16, s20, s5
+; SUPPRESS-NEXT:    fnmsub s16, s19, s4, s16
+; SUPPRESS-NEXT:    fadd s17, s16, s6
+; SUPPRESS-NEXT:    fmadd s18, s19, s5, s7
+; SUPPRESS-NEXT:    fmadd s18, s20, s4, s18
+; SUPPRESS-NEXT:    stp s17, s18, [x8, #8]
+; SUPPRESS-NEXT:    fsub s6, s6, s16
+; SUPPRESS-NEXT:    fmsub s5, s19, s5, s7
+; SUPPRESS-NEXT:    fmsub s4, s20, s4, s5
+; SUPPRESS-NEXT:    stp s6, s4, [x8, #40]
 ; SUPPRESS-NEXT:    lsl x9, x3, #33
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x10, x9, asr #29
-; SUPPRESS-NEXT:    ldp s2, s3, [x9]
-; SUPPRESS-NEXT:    ldp s5, s6, [x8, #16]
-; SUPPRESS-NEXT:    fmul s16, s4, s3
-; SUPPRESS-NEXT:    fmul s3, s7, s3
-; SUPPRESS-NEXT:    fnmsub s7, s7, s2, s16
-; SUPPRESS-NEXT:    fmadd s2, s4, s2, s3
-; SUPPRESS-NEXT:    fadd s3, s7, s5
-; SUPPRESS-NEXT:    fadd s4, s2, s6
-; SUPPRESS-NEXT:    stp s3, s4, [x8, #16]
-; SUPPRESS-NEXT:    fsub s3, s5, s7
-; SUPPRESS-NEXT:    fsub s2, s6, s2
-; SUPPRESS-NEXT:    stp s3, s2, [x8, #48]
+; SUPPRESS-NEXT:    ldp s4, s5, [x9]
+; SUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
+; SUPPRESS-NEXT:    fmul s16, s3, s5
+; SUPPRESS-NEXT:    fnmsub s16, s2, s4, s16
+; SUPPRESS-NEXT:    fadd s17, s16, s6
+; SUPPRESS-NEXT:    fmadd s18, s2, s5, s7
+; SUPPRESS-NEXT:    fmadd s18, s3, s4, s18
+; SUPPRESS-NEXT:    stp s17, s18, [x8, #16]
+; SUPPRESS-NEXT:    fsub s6, s6, s16
+; SUPPRESS-NEXT:    fmsub s2, s2, s5, s7
+; SUPPRESS-NEXT:    fmsub s2, s3, s4, s2
+; SUPPRESS-NEXT:    stp s6, s2, [x8, #48]
 ; SUPPRESS-NEXT:    add w9, w3, w3, lsl #1
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x10, w9, sxtw #3
 ; SUPPRESS-NEXT:    ldp s2, s3, [x9]
 ; SUPPRESS-NEXT:    ldp s4, s5, [x8, #24]
 ; SUPPRESS-NEXT:    fmul s6, s1, s3
-; SUPPRESS-NEXT:    fmul s3, s0, s3
-; SUPPRESS-NEXT:    fnmsub s0, s0, s2, s6
-; SUPPRESS-NEXT:    fmadd s1, s1, s2, s3
-; SUPPRESS-NEXT:    fadd s2, s0, s4
-; SUPPRESS-NEXT:    fadd s3, s1, s5
-; SUPPRESS-NEXT:    stp s2, s3, [x8, #24]
-; SUPPRESS-NEXT:    fsub s0, s4, s0
-; SUPPRESS-NEXT:    fsub s1, s5, s1
-; SUPPRESS-NEXT:    stp s0, s1, [x8, #56]
+; SUPPRESS-NEXT:    fnmsub s6, s0, s2, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s4
+; SUPPRESS-NEXT:    fmadd s16, s0, s3, s5
+; SUPPRESS-NEXT:    fmadd s16, s1, s2, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8, #24]
+; SUPPRESS-NEXT:    fsub s4, s4, s6
+; SUPPRESS-NEXT:    fmsub s0, s0, s3, s5
+; SUPPRESS-NEXT:    fmsub s0, s1, s2, s0
+; SUPPRESS-NEXT:    stp s4, s0, [x8, #56]
 ; SUPPRESS-NEXT:    ret
 ;
 ; NOSUPPRESS-LABEL: load_store_units_critical:
@@ -208,165 +208,165 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
 ; NOSUPPRESS-NEXT:    ldp s2, s3, [x8]
 ; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #8]
 ; NOSUPPRESS-NEXT:    fmul s6, s5, s1
-; NOSUPPRESS-NEXT:    fmul s1, s4, s1
-; NOSUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; NOSUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; NOSUPPRESS-NEXT:    fadd s1, s4, s2
-; NOSUPPRESS-NEXT:    fadd s5, s0, s3
-; NOSUPPRESS-NEXT:    stp s1, s5, [x8]
-; NOSUPPRESS-NEXT:    fsub s2, s2, s4
-; NOSUPPRESS-NEXT:    fsub s0, s3, s0
+; NOSUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s2
+; NOSUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8]
+; NOSUPPRESS-NEXT:    fsub s2, s2, s6
+; NOSUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmsub s0, s5, s0, s1
 ; NOSUPPRESS-NEXT:    stp s2, s0, [x8, #8]
 ; NOSUPPRESS-NEXT:    ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s3, s4, [x9]
-; NOSUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
-; NOSUPPRESS-NEXT:    ldp s16, s17, [x8, #24]
-; NOSUPPRESS-NEXT:    fmul s18, s17, s4
-; NOSUPPRESS-NEXT:    fmul s4, s16, s4
-; NOSUPPRESS-NEXT:    fnmsub s16, s16, s3, s18
-; NOSUPPRESS-NEXT:    fmadd s3, s17, s3, s4
-; NOSUPPRESS-NEXT:    fadd s4, s16, s6
-; NOSUPPRESS-NEXT:    fadd s17, s3, s7
-; NOSUPPRESS-NEXT:    stp s4, s17, [x8, #16]
-; NOSUPPRESS-NEXT:    fsub s6, s6, s16
-; NOSUPPRESS-NEXT:    fsub s3, s7, s3
-; NOSUPPRESS-NEXT:    stp s6, s3, [x8, #24]
+; NOSUPPRESS-NEXT:    ldp s1, s3, [x9]
+; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #16]
+; NOSUPPRESS-NEXT:    ldp s6, s17, [x8, #24]
+; NOSUPPRESS-NEXT:    fmul s18, s17, s3
+; NOSUPPRESS-NEXT:    fnmsub s18, s6, s1, s18
+; NOSUPPRESS-NEXT:    fadd s19, s18, s4
+; NOSUPPRESS-NEXT:    fmadd s20, s6, s3, s5
+; NOSUPPRESS-NEXT:    fmadd s20, s17, s1, s20
+; NOSUPPRESS-NEXT:    stp s19, s20, [x8, #16]
+; NOSUPPRESS-NEXT:    fsub s4, s4, s18
+; NOSUPPRESS-NEXT:    fmsub s3, s6, s3, s5
+; NOSUPPRESS-NEXT:    fmsub s1, s17, s1, s3
+; NOSUPPRESS-NEXT:    stp s4, s1, [x8, #24]
 ; NOSUPPRESS-NEXT:    ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s7, s16, [x9]
-; NOSUPPRESS-NEXT:    fmul s18, s16, s17
-; NOSUPPRESS-NEXT:    fmul s17, s7, s17
-; NOSUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; NOSUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; NOSUPPRESS-NEXT:    fadd s16, s7, s1
-; NOSUPPRESS-NEXT:    fadd s17, s4, s5
-; NOSUPPRESS-NEXT:    stp s16, s17, [x8]
-; NOSUPPRESS-NEXT:    fsub s1, s1, s7
-; NOSUPPRESS-NEXT:    fsub s4, s5, s4
-; NOSUPPRESS-NEXT:    stp s1, s4, [x8, #16]
+; NOSUPPRESS-NEXT:    ldp s3, s5, [x9]
+; NOSUPPRESS-NEXT:    fmul s6, s5, s20
+; NOSUPPRESS-NEXT:    fnmsub s6, s3, s19, s6
+; NOSUPPRESS-NEXT:    fadd s17, s6, s7
+; NOSUPPRESS-NEXT:    fmadd s18, s3, s20, s16
+; NOSUPPRESS-NEXT:    fmadd s18, s5, s19, s18
+; NOSUPPRESS-NEXT:    stp s17, s18, [x8]
+; NOSUPPRESS-NEXT:    fsub s6, s7, s6
+; NOSUPPRESS-NEXT:    fmsub s3, s3, s20, s16
+; NOSUPPRESS-NEXT:    fmsub s3, s5, s19, s3
+; NOSUPPRESS-NEXT:    stp s6, s3, [x8, #16]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    lsl x9, x3, #4
 ; NOSUPPRESS-NEXT:    add x10, x10, x9
-; NOSUPPRESS-NEXT:    ldp s1, s4, [x10]
-; NOSUPPRESS-NEXT:    fmul s5, s4, s3
-; NOSUPPRESS-NEXT:    fmul s3, s1, s3
-; NOSUPPRESS-NEXT:    fnmsub s1, s1, s6, s5
-; NOSUPPRESS-NEXT:    fmadd s3, s4, s6, s3
-; NOSUPPRESS-NEXT:    fadd s4, s1, s2
-; NOSUPPRESS-NEXT:    fadd s5, s3, s0
-; NOSUPPRESS-NEXT:    stp s4, s5, [x8, #8]
-; NOSUPPRESS-NEXT:    fsub s1, s2, s1
-; NOSUPPRESS-NEXT:    fsub s0, s0, s3
-; NOSUPPRESS-NEXT:    stp s1, s0, [x8, #24]
+; NOSUPPRESS-NEXT:    ldp s3, s5, [x10]
+; NOSUPPRESS-NEXT:    fmul s6, s5, s1
+; NOSUPPRESS-NEXT:    fnmsub s6, s3, s4, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s2
+; NOSUPPRESS-NEXT:    fmadd s16, s3, s1, s0
+; NOSUPPRESS-NEXT:    fmadd s16, s5, s4, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8, #8]
+; NOSUPPRESS-NEXT:    fsub s2, s2, s6
+; NOSUPPRESS-NEXT:    fmsub s0, s3, s1, s0
+; NOSUPPRESS-NEXT:    fmsub s0, s5, s4, s0
+; NOSUPPRESS-NEXT:    stp s2, s0, [x8, #24]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    ldp s0, s1, [x10]
 ; NOSUPPRESS-NEXT:    ldp s2, s3, [x8, #32]
 ; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #40]
 ; NOSUPPRESS-NEXT:    fmul s6, s5, s1
-; NOSUPPRESS-NEXT:    fmul s1, s4, s1
-; NOSUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; NOSUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; NOSUPPRESS-NEXT:    fadd s1, s4, s2
-; NOSUPPRESS-NEXT:    fadd s5, s0, s3
-; NOSUPPRESS-NEXT:    stp s1, s5, [x8, #32]
-; NOSUPPRESS-NEXT:    fsub s2, s2, s4
-; NOSUPPRESS-NEXT:    fsub s3, s3, s0
-; NOSUPPRESS-NEXT:    stp s2, s3, [x8, #40]
+; NOSUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s2
+; NOSUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8, #32]
+; NOSUPPRESS-NEXT:    fsub s6, s2, s6
+; NOSUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmsub s1, s5, s0, s1
+; NOSUPPRESS-NEXT:    stp s6, s1, [x8, #40]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s0, s4, [x10]
-; NOSUPPRESS-NEXT:    ldp s6, s7, [x8, #48]
-; NOSUPPRESS-NEXT:    ldp s16, s17, [x8, #56]
-; NOSUPPRESS-NEXT:    fmul s18, s17, s4
-; NOSUPPRESS-NEXT:    fmul s4, s16, s4
-; NOSUPPRESS-NEXT:    fnmsub s16, s16, s0, s18
-; NOSUPPRESS-NEXT:    fmadd s0, s17, s0, s4
-; NOSUPPRESS-NEXT:    fadd s4, s16, s6
-; NOSUPPRESS-NEXT:    fadd s17, s0, s7
-; NOSUPPRESS-NEXT:    stp s4, s17, [x8, #48]
-; NOSUPPRESS-NEXT:    fsub s6, s6, s16
-; NOSUPPRESS-NEXT:    fsub s0, s7, s0
-; NOSUPPRESS-NEXT:    stp s6, s0, [x8, #56]
+; NOSUPPRESS-NEXT:    ldp s0, s2, [x10]
+; NOSUPPRESS-NEXT:    ldp s3, s4, [x8, #48]
+; NOSUPPRESS-NEXT:    ldp s5, s17, [x8, #56]
+; NOSUPPRESS-NEXT:    fmul s18, s17, s2
+; NOSUPPRESS-NEXT:    fnmsub s18, s5, s0, s18
+; NOSUPPRESS-NEXT:    fadd s19, s18, s3
+; NOSUPPRESS-NEXT:    fmadd s20, s5, s2, s4
+; NOSUPPRESS-NEXT:    fmadd s20, s17, s0, s20
+; NOSUPPRESS-NEXT:    stp s19, s20, [x8, #48]
+; NOSUPPRESS-NEXT:    fsub s18, s3, s18
+; NOSUPPRESS-NEXT:    fmsub s2, s5, s2, s4
+; NOSUPPRESS-NEXT:    fmsub s4, s17, s0, s2
+; NOSUPPRESS-NEXT:    stp s18, s4, [x8, #56]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s7, s16, [x10]
-; NOSUPPRESS-NEXT:    fmul s18, s16, s17
-; NOSUPPRESS-NEXT:    fmul s17, s7, s17
-; NOSUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; NOSUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; NOSUPPRESS-NEXT:    fadd s16, s7, s1
-; NOSUPPRESS-NEXT:    fadd s17, s4, s5
-; NOSUPPRESS-NEXT:    stp s16, s17, [x8, #32]
-; NOSUPPRESS-NEXT:    fsub s7, s1, s7
-; NOSUPPRESS-NEXT:    fsub s4, s5, s4
-; NOSUPPRESS-NEXT:    stp s7, s4, [x8, #48]
+; NOSUPPRESS-NEXT:    ldp s0, s3, [x10]
+; NOSUPPRESS-NEXT:    fmul s2, s3, s20
+; NOSUPPRESS-NEXT:    fnmsub s2, s0, s19, s2
+; NOSUPPRESS-NEXT:    fadd s5, s2, s7
+; NOSUPPRESS-NEXT:    fmadd s17, s0, s20, s16
+; NOSUPPRESS-NEXT:    fmadd s17, s3, s19, s17
+; NOSUPPRESS-NEXT:    stp s5, s17, [x8, #32]
+; NOSUPPRESS-NEXT:    fsub s2, s7, s2
+; NOSUPPRESS-NEXT:    fmsub s0, s0, s20, s16
+; NOSUPPRESS-NEXT:    fmsub s3, s3, s19, s0
+; NOSUPPRESS-NEXT:    stp s2, s3, [x8, #48]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    add x9, x10, x9
-; NOSUPPRESS-NEXT:    ldp s1, s5, [x9]
-; NOSUPPRESS-NEXT:    fmul s18, s5, s0
-; NOSUPPRESS-NEXT:    fmul s0, s1, s0
-; NOSUPPRESS-NEXT:    fnmsub s1, s1, s6, s18
-; NOSUPPRESS-NEXT:    fmadd s5, s5, s6, s0
-; NOSUPPRESS-NEXT:    fadd s6, s1, s2
-; NOSUPPRESS-NEXT:    fadd s18, s5, s3
-; NOSUPPRESS-NEXT:    stp s6, s18, [x8, #40]
-; NOSUPPRESS-NEXT:    fsub s0, s2, s1
-; NOSUPPRESS-NEXT:    fsub s1, s3, s5
+; NOSUPPRESS-NEXT:    ldp s7, s16, [x9]
+; NOSUPPRESS-NEXT:    fmul s0, s16, s4
+; NOSUPPRESS-NEXT:    fnmsub s0, s7, s18, s0
+; NOSUPPRESS-NEXT:    fadd s19, s0, s6
+; NOSUPPRESS-NEXT:    fmadd s20, s7, s4, s1
+; NOSUPPRESS-NEXT:    fmadd s20, s16, s18, s20
+; NOSUPPRESS-NEXT:    stp s19, s20, [x8, #40]
+; NOSUPPRESS-NEXT:    fsub s0, s6, s0
+; NOSUPPRESS-NEXT:    fmsub s1, s7, s4, s1
+; NOSUPPRESS-NEXT:    fmsub s1, s16, s18, s1
 ; NOSUPPRESS-NEXT:    stp s0, s1, [x8, #56]
 ; NOSUPPRESS-NEXT:    ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s2, s3, [x9]
-; NOSUPPRESS-NEXT:    ldp s5, s19, [x8]
-; NOSUPPRESS-NEXT:    fmul s20, s17, s3
-; NOSUPPRESS-NEXT:    fmul s3, s16, s3
-; NOSUPPRESS-NEXT:    fnmsub s16, s16, s2, s20
-; NOSUPPRESS-NEXT:    fmadd s2, s17, s2, s3
-; NOSUPPRESS-NEXT:    fadd s3, s16, s5
-; NOSUPPRESS-NEXT:    fadd s17, s2, s19
-; NOSUPPRESS-NEXT:    stp s3, s17, [x8]
-; NOSUPPRESS-NEXT:    fsub s3, s5, s16
-; NOSUPPRESS-NEXT:    fsub s2, s19, s2
-; NOSUPPRESS-NEXT:    stp s3, s2, [x8, #32]
+; NOSUPPRESS-NEXT:    ldp s4, s6, [x9]
+; NOSUPPRESS-NEXT:    ldp s7, s16, [x8]
+; NOSUPPRESS-NEXT:    fmul s18, s17, s6
+; NOSUPPRESS-NEXT:    fnmsub s18, s5, s4, s18
+; NOSUPPRESS-NEXT:    fadd s21, s18, s7
+; NOSUPPRESS-NEXT:    fmadd s22, s5, s6, s16
+; NOSUPPRESS-NEXT:    fmadd s22, s17, s4, s22
+; NOSUPPRESS-NEXT:    stp s21, s22, [x8]
+; NOSUPPRESS-NEXT:    fsub s7, s7, s18
+; NOSUPPRESS-NEXT:    fmsub s5, s5, s6, s16
+; NOSUPPRESS-NEXT:    fmsub s4, s17, s4, s5
+; NOSUPPRESS-NEXT:    stp s7, s4, [x8, #32]
 ; NOSUPPRESS-NEXT:    ldr x9, [x0, #8]
 ; NOSUPPRESS-NEXT:    add x9, x9, w3, sxtw #3
-; NOSUPPRESS-NEXT:    ldp s2, s3, [x9]
-; NOSUPPRESS-NEXT:    ldp s5, s16, [x8, #8]
-; NOSUPPRESS-NEXT:    fmul s17, s18, s3
-; NOSUPPRESS-NEXT:    fmul s3, s6, s3
-; NOSUPPRESS-NEXT:    fnmsub s6, s6, s2, s17
-; NOSUPPRESS-NEXT:    fmadd s2, s18, s2, s3
-; NOSUPPRESS-NEXT:    fadd s3, s6, s5
-; NOSUPPRESS-NEXT:    fadd s17, s2, s16
-; NOSUPPRESS-NEXT:    stp s3, s17, [x8, #8]
-; NOSUPPRESS-NEXT:    fsub s3, s5, s6
-; NOSUPPRESS-NEXT:    fsub s2, s16, s2
-; NOSUPPRESS-NEXT:    stp s3, s2, [x8, #40]
+; NOSUPPRESS-NEXT:    ldp s4, s5, [x9]
+; NOSUPPRESS-NEXT:    ldp s6, s7, [x8, #8]
+; NOSUPPRESS-NEXT:    fmul s16, s20, s5
+; NOSUPPRESS-NEXT:    fnmsub s16, s19, s4, s16
+; NOSUPPRESS-NEXT:    fadd s17, s16, s6
+; NOSUPPRESS-NEXT:    fmadd s18, s19, s5, s7
+; NOSUPPRESS-NEXT:    fmadd s18, s20, s4, s18
+; NOSUPPRESS-NEXT:    stp s17, s18, [x8, #8]
+; NOSUPPRESS-NEXT:    fsub s6, s6, s16
+; NOSUPPRESS-NEXT:    fmsub s5, s19, s5, s7
+; NOSUPPRESS-NEXT:    fmsub s4, s20, s4, s5
+; NOSUPPRESS-NEXT:    stp s6, s4, [x8, #40]
 ; NOSUPPRESS-NEXT:    lsl x9, x3, #33
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    add x9, x10, x9, asr #29
-; NOSUPPRESS-NEXT:    ldp s2, s3, [x9]
-; NOSUPPRESS-NEXT:    ldp s5, s6, [x8, #16]
-; NOSUPPRESS-NEXT:    fmul s16, s4, s3
-; NOSUPPRESS-NEXT:    fmul s3, s7, s3
-; NOSUPPRESS-NEXT:    fnmsub s7, s7, s2, s16
-; NOSUPPRESS-NEXT:    fmadd s2, s4, s2, s3
-; NOSUPPRESS-NEXT:    fadd s3, s7, s5
-; NOSUPPRESS-NEXT:    fadd s4, s2, s6
-; NOSUPPRESS-NEXT:    stp s3, s4, [x8, #16]
-; NOSUPPRESS-NEXT:    fsub s3, s5, s7
-; NOSUPPRESS-NEXT:    fsub s2, s6, s2
-; NOSUPPRESS-NEXT:    stp s3, s2, [x8, #48]
+; NOSUPPRESS-NEXT:    ldp s4, s5, [x9]
+; NOSUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
+; NOSUPPRESS-NEXT:    fmul s16, s3, s5
+; NOSUPPRESS-NEXT:    fnmsub s16, s2, s4, s16
+; NOSUPPRESS-NEXT:    fadd s17, s16, s6
+; NOSUPPRESS-NEXT:    fmadd s18, s2, s5, s7
+; NOSUPPRESS-NEXT:    fmadd s18, s3, s4, s18
+; NOSUPPRESS-NEXT:    stp s17, s18, [x8, #16]
+; NOSUPPRESS-NEXT:    fsub s6, s6, s16
+; NOSUPPRESS-NEXT:    fmsub s2, s2, s5, s7
+; NOSUPPRESS-NEXT:    fmsub s2, s3, s4, s2
+; NOSUPPRESS-NEXT:    stp s6, s2, [x8, #48]
 ; NOSUPPRESS-NEXT:    add w9, w3, w3, lsl #1
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    add x9, x10, w9, sxtw #3
 ; NOSUPPRESS-NEXT:    ldp s2, s3, [x9]
 ; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #24]
 ; NOSUPPRESS-NEXT:    fmul s6, s1, s3
-; NOSUPPRESS-NEXT:    fmul s3, s0, s3
-; NOSUPPRESS-NEXT:    fnmsub s0, s0, s2, s6
-; NOSUPPRESS-NEXT:    fmadd s1, s1, s2, s3
-; NOSUPPRESS-NEXT:    fadd s2, s0, s4
-; NOSUPPRESS-NEXT:    fadd s3, s1, s5
-; NOSUPPRESS-NEXT:    stp s2, s3, [x8, #24]
-; NOSUPPRESS-NEXT:    fsub s0, s4, s0
-; NOSUPPRESS-NEXT:    fsub s1, s5, s1
-; NOSUPPRESS-NEXT:    stp s0, s1, [x8, #56]
+; NOSUPPRESS-NEXT:    fnmsub s6, s0, s2, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s4
+; NOSUPPRESS-NEXT:    fmadd s16, s0, s3, s5
+; NOSUPPRESS-NEXT:    fmadd s16, s1, s2, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8, #24]
+; NOSUPPRESS-NEXT:    fsub s4, s4, s6
+; NOSUPPRESS-NEXT:    fmsub s0, s0, s3, s5
+; NOSUPPRESS-NEXT:    fmsub s0, s1, s2, s0
+; NOSUPPRESS-NEXT:    stp s4, s0, [x8, #56]
 ; NOSUPPRESS-NEXT:    ret
 bb:
   %shl = shl i64 %arg3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
index 09a1f45557608..53952c505621f 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
@@ -22,21 +22,21 @@ define void @main(float %arg) {
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
   ; CHECK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FMAC_F32_e64_1]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb11:
   ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_1]], %bb.1
-  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_]], %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[COPY1]], %bb.1
   ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1
   ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI2]], implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
-  ; CHECK-NEXT:   S_CMP_LG_U32 killed [[COPY1]], killed [[S_MOV_B32_3]], implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
-  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY2]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; CHECK-NEXT:   S_CMP_LG_U32 killed [[COPY2]], killed [[S_MOV_B32_3]], implicit-def $scc
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc
+  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY3]], implicit-def dead $scc
   ; CHECK-NEXT:   $vcc_lo = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.4, implicit $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.3
@@ -90,13 +90,11 @@ define float @test2(float %arg, float %arg1) {
   ; CHECK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = nsz contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY]], 0, killed [[S_MOV_B32_]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
   ; CHECK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = nsz contract reassoc nofpexcept V_FMAC_F32_e64 0, [[COPY1]], 0, killed [[S_MOV_B32_1]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nsz contract reassoc nofpexcept V_ADD_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_RCP_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_RCP_F32_e64_1]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, [[V_RCP_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_MAX_F32_e64_]], 0, killed [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_ADD_F32_e64_1]]
+  ; CHECK-NEXT:   [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_RCP_F32_e64_]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, killed [[V_MUL_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_MAX_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_ADD_F32_e64_]]
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
 bb:
   %i = fmul contract float %arg1, 1.000000e+02
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 451f64f71282b..67331bdab1ee4 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -39,7 +39,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GFX10-NEXT:    v_sub_f32_e32 v8, s0, v1
 ; GFX10-NEXT:    v_fma_f32 v7, -s2, v6, s6
 ; GFX10-NEXT:    v_fma_f32 v5, v6, v5, 1.0
-; GFX10-NEXT:    v_mad_f32 v10, s2, v6, v2
+; GFX10-NEXT:    v_fma_f32 v10, s2, v6, v2
 ; GFX10-NEXT:    s_mov_b32 s0, 0x3c23d70a
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v6, v8
 ; GFX10-NEXT:    v_fmac_f32_e32 v10, v7, v6
@@ -265,8 +265,8 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v2, v4, v5
+; GFX10-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_mac_f32_e32 v2, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -294,8 +294,8 @@ define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v2, v4, v5
+; GFX10-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_mac_f32_e32 v2, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -322,8 +322,8 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float
 ; GFX10-LABEL: fmac_sequence_innermost_fmul_sgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mac_f32_e64 v0, s2, s3
-; GFX10-NEXT:    v_fmac_f32_e64 v0, s0, s1
-; GFX10-NEXT:    v_fmac_f32_e64 v0, s4, s5
+; GFX10-NEXT:    v_mac_f32_e64 v0, s0, s1
+; GFX10-NEXT:    v_mac_f32_e64 v0, s4, s5
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: fmac_sequence_innermost_fmul_sgpr:
@@ -346,21 +346,23 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a
 ; GFX10-LABEL: fmac_sequence_innermost_fmul_multiple_use:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mul_f32_e64 v1, s2, s3
+; GFX10-NEXT:    v_mac_f32_e64 v0, s2, s3
 ; GFX10-NEXT:    v_fmac_f32_e64 v1, s0, s1
-; GFX10-NEXT:    v_fma_f32 v2, s5, s4, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v1, s5, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mac_f32_e64 v0, s0, s1
+; GFX10-NEXT:    v_fmac_f32_e64 v1, s4, s5
+; GFX10-NEXT:    v_mac_f32_e32 v0, s5, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: fmac_sequence_innermost_fmul_multiple_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mul_f32_e64 v1, s2, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e64 v0, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fmac_f32_e64 v1, s0, s1
-; GFX11-NEXT:    v_fma_f32 v2, s5, s4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v1, s5, v2
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_fmac_f32_e64 v0, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e64 v1, s4, s5
+; GFX11-NEXT:    v_fmac_f32_e32 v0, s5, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %t0 = fmul fast float %a, %b
   %t1 = fmul fast float %c, %d
diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index b36c6e707ebab..f32030e6eab88 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -172,12 +172,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v3, v2
-; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT:    buffer_store_dword v5, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    s_endpgm
 ;
@@ -195,11 +195,11 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v5, off, s[0:3], 0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    s_endpgm
@@ -255,12 +255,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v2, v3
-; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT:    buffer_store_dword v5, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    s_endpgm
 ;
@@ -278,11 +278,11 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v5, off, s[0:3], 0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v2, v0
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    s_endpgm
@@ -338,12 +338,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v5, v0, v1
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v5, off, s[0:3], 0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v3, v2
-; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    s_endpgm
 ;
@@ -361,11 +362,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT:    v_fma_f32 v3, v0, v1, v5
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    s_endpgm
@@ -421,12 +423,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
-; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v5, v0, v1
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v5, off, s[0:3], 0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v2, v3
-; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FLUSH-NEXT:    s_endpgm
 ;
@@ -444,11 +447,12 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
-; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v2, v0
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT:    v_fma_f32 v3, v0, v1, v5
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-FASTFMA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/PowerPC/fma-precision.ll b/llvm/test/CodeGen/PowerPC/fma-precision.ll
index 762d2336e2932..bf592589e3261 100644
--- a/llvm/test/CodeGen/PowerPC/fma-precision.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-precision.ll
@@ -140,15 +140,17 @@ define double @fma_multi_uses2(double %a, double %b, double %c, double %d, ptr %
 define double @fma_multi_uses3(double %a, double %b, double %c, double %d, double %f, double %g, ptr %p1, ptr %p2, ptr %p3) {
 ; CHECK-LABEL: fma_multi_uses3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xsmuldp 0, 1, 2
-; CHECK-NEXT:    xsmuldp 1, 5, 6
+; CHECK-NEXT:    xsmuldp 7, 1, 2
+; CHECK-NEXT:    xsmuldp 0, 5, 6
 ; CHECK-NEXT:    ld 3, 96(1)
-; CHECK-NEXT:    stfd 0, 0(9)
-; CHECK-NEXT:    stfd 0, 0(10)
-; CHECK-NEXT:    stfd 1, 0(3)
-; CHECK-NEXT:    xsnmsubadp 1, 3, 4
+; CHECK-NEXT:    stfd 7, 0(9)
+; CHECK-NEXT:    stfd 7, 0(10)
+; CHECK-NEXT:    stfd 0, 0(3)
 ; CHECK-NEXT:    xsnmsubadp 0, 3, 4
-; CHECK-NEXT:    xsadddp 1, 0, 1
+; CHECK-NEXT:    xsnegdp 3, 3
+; CHECK-NEXT:    xsmaddadp 0, 1, 2
+; CHECK-NEXT:    xsmaddadp 0, 3, 4
+; CHECK-NEXT:    fmr 1, 0
 ; CHECK-NEXT:    blr
   %ab = fmul contract reassoc double %a, %b
   %cd = fmul contract reassoc double %c, %d

>From 3cd802cb1bbf5bace12245f59013e8658b3c8ed6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 10 Jun 2024 17:18:21 +0100
Subject: [PATCH 2/2] Stop converting existing FMAs to FMAD

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   |  4 ++--
 llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d87f9a9a2977c..cf8a682327c00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15900,8 +15900,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
         Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT, Tmp.getOperand(0),
                               Tmp.getOperand(1), E);
         for (SDNode *FMA : reverse(FMAs))
-          Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT,
-                                FMA->getOperand(0), FMA->getOperand(1), Tmp);
+          Tmp = matcher.getNode(FMA->getOpcode(), SL, VT, FMA->getOperand(0),
+                                FMA->getOperand(1), Tmp);
         return Tmp;
       }
     }
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 67331bdab1ee4..c4d7a5c29f2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -265,8 +265,8 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT:    v_mac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_mac_f32_e32 v2, v4, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -294,8 +294,8 @@ define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_f32 v2, v2, v3, v6
-; GFX10-NEXT:    v_mac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_mac_f32_e32 v2, v4, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -322,8 +322,8 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float
 ; GFX10-LABEL: fmac_sequence_innermost_fmul_sgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mac_f32_e64 v0, s2, s3
-; GFX10-NEXT:    v_mac_f32_e64 v0, s0, s1
-; GFX10-NEXT:    v_mac_f32_e64 v0, s4, s5
+; GFX10-NEXT:    v_fmac_f32_e64 v0, s0, s1
+; GFX10-NEXT:    v_fmac_f32_e64 v0, s4, s5
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: fmac_sequence_innermost_fmul_sgpr:
@@ -348,9 +348,9 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a
 ; GFX10-NEXT:    v_mul_f32_e64 v1, s2, s3
 ; GFX10-NEXT:    v_mac_f32_e64 v0, s2, s3
 ; GFX10-NEXT:    v_fmac_f32_e64 v1, s0, s1
-; GFX10-NEXT:    v_mac_f32_e64 v0, s0, s1
+; GFX10-NEXT:    v_fmac_f32_e64 v0, s0, s1
 ; GFX10-NEXT:    v_fmac_f32_e64 v1, s4, s5
-; GFX10-NEXT:    v_mac_f32_e32 v0, s5, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v0, s5, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: fmac_sequence_innermost_fmul_multiple_use:



More information about the llvm-commits mailing list