[llvm] [SDAG] Heed enableAggressiveFMAFusion when folding fma(a,b,c*d)+e (PR #94209)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 3 03:42:08 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-backend-aarch64

Author: Jay Foad (jayfoad)

<details>
<summary>Changes</summary>

enableAggressiveFMAFusion is supposed to enable folding to fma even if
the intermediate nodes have multiple uses. Implement this for the fold:
  fma(a,b,c*d)+e -> fma(a,b,fma(c,d,e))
and its generalization to longer chains of fmas.

Since the intermediate nodes can have multiple uses, we can no longer
modify them in place, so build a new chain of fmas instead.


---

Patch is 43.60 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/94209.diff


6 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+18-18) 
- (modified) llvm/test/CodeGen/AArch64/storepairsuppress.ll (+262-262) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll (+11-13) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll (+17-15) 
- (modified) llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll (+44-40) 
- (modified) llvm/test/CodeGen/PowerPC/fma-precision.ll (+9-7) 


``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5148b7258257f..d87f9a9a2977c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15881,29 +15881,29 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   bool CanReassociate =
       Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
   if (CanReassociate) {
-    SDValue FMA, E;
-    if (isFusedOp(N0) && N0.hasOneUse()) {
-      FMA = N0;
+    SDValue Tmp, E;
+    if (isFusedOp(N0) && (Aggressive || N0.hasOneUse())) {
+      Tmp = N0;
       E = N1;
-    } else if (isFusedOp(N1) && N1.hasOneUse()) {
-      FMA = N1;
+    } else if (isFusedOp(N1) && (Aggressive || N1.hasOneUse())) {
+      Tmp = N1;
       E = N0;
     }
 
-    SDValue TmpFMA = FMA;
-    while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) {
-      SDValue FMul = TmpFMA->getOperand(2);
-      if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) {
-        SDValue C = FMul.getOperand(0);
-        SDValue D = FMul.getOperand(1);
-        SDValue CDE = matcher.getNode(PreferredFusedOpcode, SL, VT, C, D, E);
-        DAG.ReplaceAllUsesOfValueWith(FMul, CDE);
-        // Replacing the inner FMul could cause the outer FMA to be simplified
-        // away.
-        return FMA.getOpcode() == ISD::DELETED_NODE ? SDValue(N, 0) : FMA;
+    if (Tmp) {
+      SmallVector<SDNode *> FMAs;
+      do {
+        FMAs.push_back(Tmp.getNode());
+        Tmp = Tmp->getOperand(2);
+      } while (isFusedOp(Tmp) && (Aggressive || Tmp.hasOneUse()));
+      if (matcher.match(Tmp, ISD::FMUL) && (Aggressive || Tmp.hasOneUse())) {
+        Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT, Tmp.getOperand(0),
+                              Tmp.getOperand(1), E);
+        for (SDNode *FMA : reverse(FMAs))
+          Tmp = matcher.getNode(PreferredFusedOpcode, SL, VT,
+                                FMA->getOperand(0), FMA->getOperand(1), Tmp);
+        return Tmp;
       }
-
-      TmpFMA = TmpFMA->getOperand(2);
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress.ll b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
index 0571bbc278a6f..c2dc73b54a057 100644
--- a/llvm/test/CodeGen/AArch64/storepairsuppress.ll
+++ b/llvm/test/CodeGen/AArch64/storepairsuppress.ll
@@ -39,165 +39,165 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
 ; SUPPRESS-NEXT:    ldp s2, s3, [x8]
 ; SUPPRESS-NEXT:    ldp s4, s5, [x8, #8]
 ; SUPPRESS-NEXT:    fmul s6, s5, s1
-; SUPPRESS-NEXT:    fmul s1, s4, s1
-; SUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; SUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; SUPPRESS-NEXT:    fadd s1, s4, s2
-; SUPPRESS-NEXT:    fadd s5, s0, s3
-; SUPPRESS-NEXT:    stp s1, s5, [x8]
-; SUPPRESS-NEXT:    fsub s2, s2, s4
-; SUPPRESS-NEXT:    fsub s0, s3, s0
+; SUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s2
+; SUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; SUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8]
+; SUPPRESS-NEXT:    fsub s2, s2, s6
+; SUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; SUPPRESS-NEXT:    fmsub s0, s5, s0, s1
 ; SUPPRESS-NEXT:    stp s2, s0, [x8, #8]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
-; SUPPRESS-NEXT:    ldp s3, s4, [x9]
-; SUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
-; SUPPRESS-NEXT:    ldp s16, s17, [x8, #24]
-; SUPPRESS-NEXT:    fmul s18, s17, s4
-; SUPPRESS-NEXT:    fmul s4, s16, s4
-; SUPPRESS-NEXT:    fnmsub s16, s16, s3, s18
-; SUPPRESS-NEXT:    fmadd s3, s17, s3, s4
-; SUPPRESS-NEXT:    fadd s4, s16, s6
-; SUPPRESS-NEXT:    fadd s17, s3, s7
-; SUPPRESS-NEXT:    stp s4, s17, [x8, #16]
-; SUPPRESS-NEXT:    fsub s6, s6, s16
-; SUPPRESS-NEXT:    fsub s3, s7, s3
-; SUPPRESS-NEXT:    stp s6, s3, [x8, #24]
+; SUPPRESS-NEXT:    ldp s1, s3, [x9]
+; SUPPRESS-NEXT:    ldp s4, s5, [x8, #16]
+; SUPPRESS-NEXT:    ldp s6, s17, [x8, #24]
+; SUPPRESS-NEXT:    fmul s18, s17, s3
+; SUPPRESS-NEXT:    fnmsub s18, s6, s1, s18
+; SUPPRESS-NEXT:    fadd s19, s18, s4
+; SUPPRESS-NEXT:    fmadd s20, s6, s3, s5
+; SUPPRESS-NEXT:    fmadd s20, s17, s1, s20
+; SUPPRESS-NEXT:    stp s19, s20, [x8, #16]
+; SUPPRESS-NEXT:    fsub s4, s4, s18
+; SUPPRESS-NEXT:    fmsub s3, s6, s3, s5
+; SUPPRESS-NEXT:    fmsub s1, s17, s1, s3
+; SUPPRESS-NEXT:    stp s4, s1, [x8, #24]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
-; SUPPRESS-NEXT:    ldp s7, s16, [x9]
-; SUPPRESS-NEXT:    fmul s18, s16, s17
-; SUPPRESS-NEXT:    fmul s17, s7, s17
-; SUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; SUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; SUPPRESS-NEXT:    fadd s16, s7, s1
-; SUPPRESS-NEXT:    fadd s17, s4, s5
-; SUPPRESS-NEXT:    stp s16, s17, [x8]
-; SUPPRESS-NEXT:    fsub s1, s1, s7
-; SUPPRESS-NEXT:    fsub s4, s5, s4
-; SUPPRESS-NEXT:    stp s1, s4, [x8, #16]
+; SUPPRESS-NEXT:    ldp s3, s5, [x9]
+; SUPPRESS-NEXT:    fmul s6, s5, s20
+; SUPPRESS-NEXT:    fnmsub s6, s3, s19, s6
+; SUPPRESS-NEXT:    fadd s17, s6, s7
+; SUPPRESS-NEXT:    fmadd s18, s3, s20, s16
+; SUPPRESS-NEXT:    fmadd s18, s5, s19, s18
+; SUPPRESS-NEXT:    stp s17, s18, [x8]
+; SUPPRESS-NEXT:    fsub s6, s7, s6
+; SUPPRESS-NEXT:    fmsub s3, s3, s20, s16
+; SUPPRESS-NEXT:    fmsub s3, s5, s19, s3
+; SUPPRESS-NEXT:    stp s6, s3, [x8, #16]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    lsl x9, x3, #4
 ; SUPPRESS-NEXT:    add x10, x10, x9
-; SUPPRESS-NEXT:    ldp s1, s4, [x10]
-; SUPPRESS-NEXT:    fmul s5, s4, s3
-; SUPPRESS-NEXT:    fmul s3, s1, s3
-; SUPPRESS-NEXT:    fnmsub s1, s1, s6, s5
-; SUPPRESS-NEXT:    fmadd s3, s4, s6, s3
-; SUPPRESS-NEXT:    fadd s4, s1, s2
-; SUPPRESS-NEXT:    fadd s5, s3, s0
-; SUPPRESS-NEXT:    stp s4, s5, [x8, #8]
-; SUPPRESS-NEXT:    fsub s1, s2, s1
-; SUPPRESS-NEXT:    fsub s0, s0, s3
-; SUPPRESS-NEXT:    stp s1, s0, [x8, #24]
+; SUPPRESS-NEXT:    ldp s3, s5, [x10]
+; SUPPRESS-NEXT:    fmul s6, s5, s1
+; SUPPRESS-NEXT:    fnmsub s6, s3, s4, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s2
+; SUPPRESS-NEXT:    fmadd s16, s3, s1, s0
+; SUPPRESS-NEXT:    fmadd s16, s5, s4, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8, #8]
+; SUPPRESS-NEXT:    fsub s2, s2, s6
+; SUPPRESS-NEXT:    fmsub s0, s3, s1, s0
+; SUPPRESS-NEXT:    fmsub s0, s5, s4, s0
+; SUPPRESS-NEXT:    stp s2, s0, [x8, #24]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    ldp s0, s1, [x10]
 ; SUPPRESS-NEXT:    ldp s2, s3, [x8, #32]
 ; SUPPRESS-NEXT:    ldp s4, s5, [x8, #40]
 ; SUPPRESS-NEXT:    fmul s6, s5, s1
-; SUPPRESS-NEXT:    fmul s1, s4, s1
-; SUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; SUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; SUPPRESS-NEXT:    fadd s1, s4, s2
-; SUPPRESS-NEXT:    fadd s5, s0, s3
-; SUPPRESS-NEXT:    stp s1, s5, [x8, #32]
-; SUPPRESS-NEXT:    fsub s2, s2, s4
-; SUPPRESS-NEXT:    fsub s3, s3, s0
-; SUPPRESS-NEXT:    stp s2, s3, [x8, #40]
+; SUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s2
+; SUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; SUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8, #32]
+; SUPPRESS-NEXT:    fsub s6, s2, s6
+; SUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; SUPPRESS-NEXT:    fmsub s1, s5, s0, s1
+; SUPPRESS-NEXT:    stp s6, s1, [x8, #40]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
-; SUPPRESS-NEXT:    ldp s0, s4, [x10]
-; SUPPRESS-NEXT:    ldp s6, s7, [x8, #48]
-; SUPPRESS-NEXT:    ldp s16, s17, [x8, #56]
-; SUPPRESS-NEXT:    fmul s18, s17, s4
-; SUPPRESS-NEXT:    fmul s4, s16, s4
-; SUPPRESS-NEXT:    fnmsub s16, s16, s0, s18
-; SUPPRESS-NEXT:    fmadd s0, s17, s0, s4
-; SUPPRESS-NEXT:    fadd s4, s16, s6
-; SUPPRESS-NEXT:    fadd s17, s0, s7
-; SUPPRESS-NEXT:    stp s4, s17, [x8, #48]
-; SUPPRESS-NEXT:    fsub s6, s6, s16
-; SUPPRESS-NEXT:    fsub s0, s7, s0
-; SUPPRESS-NEXT:    stp s6, s0, [x8, #56]
+; SUPPRESS-NEXT:    ldp s0, s2, [x10]
+; SUPPRESS-NEXT:    ldp s3, s4, [x8, #48]
+; SUPPRESS-NEXT:    ldp s5, s17, [x8, #56]
+; SUPPRESS-NEXT:    fmul s18, s17, s2
+; SUPPRESS-NEXT:    fnmsub s18, s5, s0, s18
+; SUPPRESS-NEXT:    fadd s19, s18, s3
+; SUPPRESS-NEXT:    fmadd s20, s5, s2, s4
+; SUPPRESS-NEXT:    fmadd s20, s17, s0, s20
+; SUPPRESS-NEXT:    stp s19, s20, [x8, #48]
+; SUPPRESS-NEXT:    fsub s18, s3, s18
+; SUPPRESS-NEXT:    fmsub s2, s5, s2, s4
+; SUPPRESS-NEXT:    fmsub s4, s17, s0, s2
+; SUPPRESS-NEXT:    stp s18, s4, [x8, #56]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
-; SUPPRESS-NEXT:    ldp s7, s16, [x10]
-; SUPPRESS-NEXT:    fmul s18, s16, s17
-; SUPPRESS-NEXT:    fmul s17, s7, s17
-; SUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; SUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; SUPPRESS-NEXT:    fadd s16, s7, s1
-; SUPPRESS-NEXT:    fadd s17, s4, s5
-; SUPPRESS-NEXT:    stp s16, s17, [x8, #32]
-; SUPPRESS-NEXT:    fsub s7, s1, s7
-; SUPPRESS-NEXT:    fsub s4, s5, s4
-; SUPPRESS-NEXT:    stp s7, s4, [x8, #48]
+; SUPPRESS-NEXT:    ldp s0, s3, [x10]
+; SUPPRESS-NEXT:    fmul s2, s3, s20
+; SUPPRESS-NEXT:    fnmsub s2, s0, s19, s2
+; SUPPRESS-NEXT:    fadd s5, s2, s7
+; SUPPRESS-NEXT:    fmadd s17, s0, s20, s16
+; SUPPRESS-NEXT:    fmadd s17, s3, s19, s17
+; SUPPRESS-NEXT:    stp s5, s17, [x8, #32]
+; SUPPRESS-NEXT:    fsub s2, s7, s2
+; SUPPRESS-NEXT:    fmsub s0, s0, s20, s16
+; SUPPRESS-NEXT:    fmsub s3, s3, s19, s0
+; SUPPRESS-NEXT:    stp s2, s3, [x8, #48]
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x10, x9
-; SUPPRESS-NEXT:    ldp s1, s5, [x9]
-; SUPPRESS-NEXT:    fmul s18, s5, s0
-; SUPPRESS-NEXT:    fmul s0, s1, s0
-; SUPPRESS-NEXT:    fnmsub s1, s1, s6, s18
-; SUPPRESS-NEXT:    fmadd s5, s5, s6, s0
-; SUPPRESS-NEXT:    fadd s6, s1, s2
-; SUPPRESS-NEXT:    fadd s18, s5, s3
-; SUPPRESS-NEXT:    stp s6, s18, [x8, #40]
-; SUPPRESS-NEXT:    fsub s0, s2, s1
-; SUPPRESS-NEXT:    fsub s1, s3, s5
+; SUPPRESS-NEXT:    ldp s7, s16, [x9]
+; SUPPRESS-NEXT:    fmul s0, s16, s4
+; SUPPRESS-NEXT:    fnmsub s0, s7, s18, s0
+; SUPPRESS-NEXT:    fadd s19, s0, s6
+; SUPPRESS-NEXT:    fmadd s20, s7, s4, s1
+; SUPPRESS-NEXT:    fmadd s20, s16, s18, s20
+; SUPPRESS-NEXT:    stp s19, s20, [x8, #40]
+; SUPPRESS-NEXT:    fsub s0, s6, s0
+; SUPPRESS-NEXT:    fmsub s1, s7, s4, s1
+; SUPPRESS-NEXT:    fmsub s1, s16, s18, s1
 ; SUPPRESS-NEXT:    stp s0, s1, [x8, #56]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
-; SUPPRESS-NEXT:    ldp s2, s3, [x9]
-; SUPPRESS-NEXT:    ldp s5, s19, [x8]
-; SUPPRESS-NEXT:    fmul s20, s17, s3
-; SUPPRESS-NEXT:    fmul s3, s16, s3
-; SUPPRESS-NEXT:    fnmsub s16, s16, s2, s20
-; SUPPRESS-NEXT:    fmadd s2, s17, s2, s3
-; SUPPRESS-NEXT:    fadd s3, s16, s5
-; SUPPRESS-NEXT:    fadd s17, s2, s19
-; SUPPRESS-NEXT:    stp s3, s17, [x8]
-; SUPPRESS-NEXT:    fsub s3, s5, s16
-; SUPPRESS-NEXT:    fsub s2, s19, s2
-; SUPPRESS-NEXT:    stp s3, s2, [x8, #32]
+; SUPPRESS-NEXT:    ldp s4, s6, [x9]
+; SUPPRESS-NEXT:    ldp s7, s16, [x8]
+; SUPPRESS-NEXT:    fmul s18, s17, s6
+; SUPPRESS-NEXT:    fnmsub s18, s5, s4, s18
+; SUPPRESS-NEXT:    fadd s21, s18, s7
+; SUPPRESS-NEXT:    fmadd s22, s5, s6, s16
+; SUPPRESS-NEXT:    fmadd s22, s17, s4, s22
+; SUPPRESS-NEXT:    stp s21, s22, [x8]
+; SUPPRESS-NEXT:    fsub s7, s7, s18
+; SUPPRESS-NEXT:    fmsub s5, s5, s6, s16
+; SUPPRESS-NEXT:    fmsub s4, s17, s4, s5
+; SUPPRESS-NEXT:    stp s7, s4, [x8, #32]
 ; SUPPRESS-NEXT:    ldr x9, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x9, w3, sxtw #3
-; SUPPRESS-NEXT:    ldp s2, s3, [x9]
-; SUPPRESS-NEXT:    ldp s5, s16, [x8, #8]
-; SUPPRESS-NEXT:    fmul s17, s18, s3
-; SUPPRESS-NEXT:    fmul s3, s6, s3
-; SUPPRESS-NEXT:    fnmsub s6, s6, s2, s17
-; SUPPRESS-NEXT:    fmadd s2, s18, s2, s3
-; SUPPRESS-NEXT:    fadd s3, s6, s5
-; SUPPRESS-NEXT:    fadd s17, s2, s16
-; SUPPRESS-NEXT:    stp s3, s17, [x8, #8]
-; SUPPRESS-NEXT:    fsub s3, s5, s6
-; SUPPRESS-NEXT:    fsub s2, s16, s2
-; SUPPRESS-NEXT:    stp s3, s2, [x8, #40]
+; SUPPRESS-NEXT:    ldp s4, s5, [x9]
+; SUPPRESS-NEXT:    ldp s6, s7, [x8, #8]
+; SUPPRESS-NEXT:    fmul s16, s20, s5
+; SUPPRESS-NEXT:    fnmsub s16, s19, s4, s16
+; SUPPRESS-NEXT:    fadd s17, s16, s6
+; SUPPRESS-NEXT:    fmadd s18, s19, s5, s7
+; SUPPRESS-NEXT:    fmadd s18, s20, s4, s18
+; SUPPRESS-NEXT:    stp s17, s18, [x8, #8]
+; SUPPRESS-NEXT:    fsub s6, s6, s16
+; SUPPRESS-NEXT:    fmsub s5, s19, s5, s7
+; SUPPRESS-NEXT:    fmsub s4, s20, s4, s5
+; SUPPRESS-NEXT:    stp s6, s4, [x8, #40]
 ; SUPPRESS-NEXT:    lsl x9, x3, #33
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x10, x9, asr #29
-; SUPPRESS-NEXT:    ldp s2, s3, [x9]
-; SUPPRESS-NEXT:    ldp s5, s6, [x8, #16]
-; SUPPRESS-NEXT:    fmul s16, s4, s3
-; SUPPRESS-NEXT:    fmul s3, s7, s3
-; SUPPRESS-NEXT:    fnmsub s7, s7, s2, s16
-; SUPPRESS-NEXT:    fmadd s2, s4, s2, s3
-; SUPPRESS-NEXT:    fadd s3, s7, s5
-; SUPPRESS-NEXT:    fadd s4, s2, s6
-; SUPPRESS-NEXT:    stp s3, s4, [x8, #16]
-; SUPPRESS-NEXT:    fsub s3, s5, s7
-; SUPPRESS-NEXT:    fsub s2, s6, s2
-; SUPPRESS-NEXT:    stp s3, s2, [x8, #48]
+; SUPPRESS-NEXT:    ldp s4, s5, [x9]
+; SUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
+; SUPPRESS-NEXT:    fmul s16, s3, s5
+; SUPPRESS-NEXT:    fnmsub s16, s2, s4, s16
+; SUPPRESS-NEXT:    fadd s17, s16, s6
+; SUPPRESS-NEXT:    fmadd s18, s2, s5, s7
+; SUPPRESS-NEXT:    fmadd s18, s3, s4, s18
+; SUPPRESS-NEXT:    stp s17, s18, [x8, #16]
+; SUPPRESS-NEXT:    fsub s6, s6, s16
+; SUPPRESS-NEXT:    fmsub s2, s2, s5, s7
+; SUPPRESS-NEXT:    fmsub s2, s3, s4, s2
+; SUPPRESS-NEXT:    stp s6, s2, [x8, #48]
 ; SUPPRESS-NEXT:    add w9, w3, w3, lsl #1
 ; SUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; SUPPRESS-NEXT:    add x9, x10, w9, sxtw #3
 ; SUPPRESS-NEXT:    ldp s2, s3, [x9]
 ; SUPPRESS-NEXT:    ldp s4, s5, [x8, #24]
 ; SUPPRESS-NEXT:    fmul s6, s1, s3
-; SUPPRESS-NEXT:    fmul s3, s0, s3
-; SUPPRESS-NEXT:    fnmsub s0, s0, s2, s6
-; SUPPRESS-NEXT:    fmadd s1, s1, s2, s3
-; SUPPRESS-NEXT:    fadd s2, s0, s4
-; SUPPRESS-NEXT:    fadd s3, s1, s5
-; SUPPRESS-NEXT:    stp s2, s3, [x8, #24]
-; SUPPRESS-NEXT:    fsub s0, s4, s0
-; SUPPRESS-NEXT:    fsub s1, s5, s1
-; SUPPRESS-NEXT:    stp s0, s1, [x8, #56]
+; SUPPRESS-NEXT:    fnmsub s6, s0, s2, s6
+; SUPPRESS-NEXT:    fadd s7, s6, s4
+; SUPPRESS-NEXT:    fmadd s16, s0, s3, s5
+; SUPPRESS-NEXT:    fmadd s16, s1, s2, s16
+; SUPPRESS-NEXT:    stp s7, s16, [x8, #24]
+; SUPPRESS-NEXT:    fsub s4, s4, s6
+; SUPPRESS-NEXT:    fmsub s0, s0, s3, s5
+; SUPPRESS-NEXT:    fmsub s0, s1, s2, s0
+; SUPPRESS-NEXT:    stp s4, s0, [x8, #56]
 ; SUPPRESS-NEXT:    ret
 ;
 ; NOSUPPRESS-LABEL: load_store_units_critical:
@@ -208,165 +208,165 @@ define void @load_store_units_critical(ptr %arg, ptr noundef %arg1, i64 noundef
 ; NOSUPPRESS-NEXT:    ldp s2, s3, [x8]
 ; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #8]
 ; NOSUPPRESS-NEXT:    fmul s6, s5, s1
-; NOSUPPRESS-NEXT:    fmul s1, s4, s1
-; NOSUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; NOSUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; NOSUPPRESS-NEXT:    fadd s1, s4, s2
-; NOSUPPRESS-NEXT:    fadd s5, s0, s3
-; NOSUPPRESS-NEXT:    stp s1, s5, [x8]
-; NOSUPPRESS-NEXT:    fsub s2, s2, s4
-; NOSUPPRESS-NEXT:    fsub s0, s3, s0
+; NOSUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s2
+; NOSUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8]
+; NOSUPPRESS-NEXT:    fsub s2, s2, s6
+; NOSUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmsub s0, s5, s0, s1
 ; NOSUPPRESS-NEXT:    stp s2, s0, [x8, #8]
 ; NOSUPPRESS-NEXT:    ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s3, s4, [x9]
-; NOSUPPRESS-NEXT:    ldp s6, s7, [x8, #16]
-; NOSUPPRESS-NEXT:    ldp s16, s17, [x8, #24]
-; NOSUPPRESS-NEXT:    fmul s18, s17, s4
-; NOSUPPRESS-NEXT:    fmul s4, s16, s4
-; NOSUPPRESS-NEXT:    fnmsub s16, s16, s3, s18
-; NOSUPPRESS-NEXT:    fmadd s3, s17, s3, s4
-; NOSUPPRESS-NEXT:    fadd s4, s16, s6
-; NOSUPPRESS-NEXT:    fadd s17, s3, s7
-; NOSUPPRESS-NEXT:    stp s4, s17, [x8, #16]
-; NOSUPPRESS-NEXT:    fsub s6, s6, s16
-; NOSUPPRESS-NEXT:    fsub s3, s7, s3
-; NOSUPPRESS-NEXT:    stp s6, s3, [x8, #24]
+; NOSUPPRESS-NEXT:    ldp s1, s3, [x9]
+; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #16]
+; NOSUPPRESS-NEXT:    ldp s6, s17, [x8, #24]
+; NOSUPPRESS-NEXT:    fmul s18, s17, s3
+; NOSUPPRESS-NEXT:    fnmsub s18, s6, s1, s18
+; NOSUPPRESS-NEXT:    fadd s19, s18, s4
+; NOSUPPRESS-NEXT:    fmadd s20, s6, s3, s5
+; NOSUPPRESS-NEXT:    fmadd s20, s17, s1, s20
+; NOSUPPRESS-NEXT:    stp s19, s20, [x8, #16]
+; NOSUPPRESS-NEXT:    fsub s4, s4, s18
+; NOSUPPRESS-NEXT:    fmsub s3, s6, s3, s5
+; NOSUPPRESS-NEXT:    fmsub s1, s17, s1, s3
+; NOSUPPRESS-NEXT:    stp s4, s1, [x8, #24]
 ; NOSUPPRESS-NEXT:    ldr x9, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s7, s16, [x9]
-; NOSUPPRESS-NEXT:    fmul s18, s16, s17
-; NOSUPPRESS-NEXT:    fmul s17, s7, s17
-; NOSUPPRESS-NEXT:    fnmsub s7, s7, s4, s18
-; NOSUPPRESS-NEXT:    fmadd s4, s16, s4, s17
-; NOSUPPRESS-NEXT:    fadd s16, s7, s1
-; NOSUPPRESS-NEXT:    fadd s17, s4, s5
-; NOSUPPRESS-NEXT:    stp s16, s17, [x8]
-; NOSUPPRESS-NEXT:    fsub s1, s1, s7
-; NOSUPPRESS-NEXT:    fsub s4, s5, s4
-; NOSUPPRESS-NEXT:    stp s1, s4, [x8, #16]
+; NOSUPPRESS-NEXT:    ldp s3, s5, [x9]
+; NOSUPPRESS-NEXT:    fmul s6, s5, s20
+; NOSUPPRESS-NEXT:    fnmsub s6, s3, s19, s6
+; NOSUPPRESS-NEXT:    fadd s17, s6, s7
+; NOSUPPRESS-NEXT:    fmadd s18, s3, s20, s16
+; NOSUPPRESS-NEXT:    fmadd s18, s5, s19, s18
+; NOSUPPRESS-NEXT:    stp s17, s18, [x8]
+; NOSUPPRESS-NEXT:    fsub s6, s7, s6
+; NOSUPPRESS-NEXT:    fmsub s3, s3, s20, s16
+; NOSUPPRESS-NEXT:    fmsub s3, s5, s19, s3
+; NOSUPPRESS-NEXT:    stp s6, s3, [x8, #16]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    lsl x9, x3, #4
 ; NOSUPPRESS-NEXT:    add x10, x10, x9
-; NOSUPPRESS-NEXT:    ldp s1, s4, [x10]
-; NOSUPPRESS-NEXT:    fmul s5, s4, s3
-; NOSUPPRESS-NEXT:    fmul s3, s1, s3
-; NOSUPPRESS-NEXT:    fnmsub s1, s1, s6, s5
-; NOSUPPRESS-NEXT:    fmadd s3, s4, s6, s3
-; NOSUPPRESS-NEXT:    fadd s4, s1, s2
-; NOSUPPRESS-NEXT:    fadd s5, s3, s0
-; NOSUPPRESS-NEXT:    stp s4, s5, [x8, #8]
-; NOSUPPRESS-NEXT:    fsub s1, s2, s1
-; NOSUPPRESS-NEXT:    fsub s0, s0, s3
-; NOSUPPRESS-NEXT:    stp s1, s0, [x8, #24]
+; NOSUPPRESS-NEXT:    ldp s3, s5, [x10]
+; NOSUPPRESS-NEXT:    fmul s6, s5, s1
+; NOSUPPRESS-NEXT:    fnmsub s6, s3, s4, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s2
+; NOSUPPRESS-NEXT:    fmadd s16, s3, s1, s0
+; NOSUPPRESS-NEXT:    fmadd s16, s5, s4, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8, #8]
+; NOSUPPRESS-NEXT:    fsub s2, s2, s6
+; NOSUPPRESS-NEXT:    fmsub s0, s3, s1, s0
+; NOSUPPRESS-NEXT:    fmsub s0, s5, s4, s0
+; NOSUPPRESS-NEXT:    stp s2, s0, [x8, #24]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
 ; NOSUPPRESS-NEXT:    ldp s0, s1, [x10]
 ; NOSUPPRESS-NEXT:    ldp s2, s3, [x8, #32]
 ; NOSUPPRESS-NEXT:    ldp s4, s5, [x8, #40]
 ; NOSUPPRESS-NEXT:    fmul s6, s5, s1
-; NOSUPPRESS-NEXT:    fmul s1, s4, s1
-; NOSUPPRESS-NEXT:    fnmsub s4, s4, s0, s6
-; NOSUPPRESS-NEXT:    fmadd s0, s5, s0, s1
-; NOSUPPRESS-NEXT:    fadd s1, s4, s2
-; NOSUPPRESS-NEXT:    fadd s5, s0, s3
-; NOSUPPRESS-NEXT:    stp s1, s5, [x8, #32]
-; NOSUPPRESS-NEXT:    fsub s2, s2, s4
-; NOSUPPRESS-NEXT:    fsub s3, s3, s0
-; NOSUPPRESS-NEXT:    stp s2, s3, [x8, #40]
+; NOSUPPRESS-NEXT:    fnmsub s6, s4, s0, s6
+; NOSUPPRESS-NEXT:    fadd s7, s6, s2
+; NOSUPPRESS-NEXT:    fmadd s16, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmadd s16, s5, s0, s16
+; NOSUPPRESS-NEXT:    stp s7, s16, [x8, #32]
+; NOSUPPRESS-NEXT:    fsub s6, s2, s6
+; NOSUPPRESS-NEXT:    fmsub s1, s4, s1, s3
+; NOSUPPRESS-NEXT:    fmsub s1, s5, s0, s1
+; NOSUPPRESS-NEXT:    stp s6, s1, [x8, #40]
 ; NOSUPPRESS-NEXT:    ldr x10, [x0, #8]
-; NOSUPPRESS-NEXT:    ldp s0, s4, [x10]
-; NOSUPPRESS-NEXT:    ldp s6, s7, [x8, #48]
-; NOSUPPRESS-NEXT:    ldp s16, s17, [x8, #56]
-; NOSUPPRESS-NEXT:    fmul s18, s17, s4
-; NOSUPPRESS-NEXT:    fmul s4, s16, s4
-; NOSUPPRESS-NEXT:    fnmsub s16, s16, s0, s18
-; NOSUPPRESS-NEXT:    fmadd s0, s17, s0, s4
-; NOSUPPRESS-NEXT:    fadd s4, s16, s6
-; NOSUPPRESS-NEXT:    fadd s17, s0, s7
-; NOSUPPRESS-NEXT:    stp s4, s17, [x8, #48]
-; NOSUPPRESS-NEXT:    fsub s6, s6, s16
-; NOSUPPRESS-NEXT:    fsub s0, s7, s0
-; NOSUPPRESS-NEXT:    stp s6, s0, [x8, #56]
+; NOSUPPRESS-NEXT:    ldp s0, s2, [x10]
+; NOSUPPRESS-NEXT:    ldp s3, s4, [x8, #48]
+; NOSUPPRESS-NEXT:    ldp s5, s17, [x8, #56]
+; NOSUPPRESS-NEXT:    fmul s18, s17, s2
+; NOSUPPRESS-NEXT:    fnmsub s18, s5, s0, s18
+; NOSUPPRESS-NEXT:    fadd s19, s18, s3
+; NOSUPPRESS-NEXT:    fmadd s20, s5, s2, s4
+; NOSUPPRESS-NEXT:   ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/94209


More information about the llvm-commits mailing list