[llvm] [AArch64] Sink fneg instruction to unlock fmls combine (PR #172000)

Fri Dec 12 04:56:07 PST 2025

https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/172000

>From e553a3cef97d552a815606ae1d67509a334feb8c Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Fri, 12 Dec 2025 11:39:03 +0000
Subject: [PATCH 1/2] [AArch64][NFC] Add test for suboptimal fmls combine

---
 llvm/test/CodeGen/AArch64/sink-fneg.ll | 140 +++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sink-fneg.ll

diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
new file mode 100644
index 0000000000000..e428c8e4b2bbb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+
+define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ret
+                                    <4 x float> %a, <4 x float> %b,
+                                    i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x ; single fneg, used by the fma in this block and by the fma in use_bb
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z) ; y * -x + z, same block as the fneg
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b) ; a * -x + b, fneg is defined in entry
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fnegs_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    fneg v1.4s, v3.4s
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v1.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ret
+                                     <4 x float> %a, <4 x float> %b,
+                                     i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %negx = fneg <4 x float> %x ; first negated multiplicand, used by both fma calls
+  %nega = fneg <4 x float> %a ; second negated multiplicand, used by both fma calls
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %z) ; (-a) * (-x) + z, same block as the fnegs
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %b) ; (-a) * (-x) + b, both fnegs are defined in entry
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_with_other_users:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: // %other_use
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    str q0, [sp] // 16-byte Spill
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+                                                 <4 x float> %a, <4 x float> %b,
+                                                 i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x ; besides the two fma calls, %neg is also passed to @foo and returned
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z) ; y * -x + z, same block as the fneg
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %other_use
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b) ; a * -x + b, fneg is defined in entry
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+other_use:
+  call void @foo(<4 x float> %neg) ; non-fma user: needs the negated value itself
+  br label %exit
+
+exit:
+  ret <4 x float> %neg ; the negated value is also the return value on both paths
+}
+
+define void @shared_fneg_across_bbs_fmuladd(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs_fmuladd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB3_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB3_2: // %exit
+; CHECK-NEXT:    ret
+                                    <4 x float> %a, <4 x float> %b,
+                                    i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  %neg = fneg <4 x float> %x ; single fneg, used by the fmuladd in this block and by the one in use_bb
+  %r1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z) ; same shape as @shared_fneg_across_bbs, but via llvm.fmuladd
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b) ; fmuladd user in a different block than the fneg
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare void @foo(<4 x float>) ; external callee; called with %neg in @shared_fneg_with_other_users
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) ; used by the first three tests
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) ; used by @shared_fneg_across_bbs_fmuladd

>From 80c46969e03dd6b66a1bf26b826b2c5691a2a275 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Thu, 11 Dec 2025 12:43:23 +0000
Subject: [PATCH 2/2] [AArch64] Sink fneg instruction to unlock fmls combine

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  5 +++++
 llvm/test/CodeGen/AArch64/sink-fneg.ll        | 19 ++++++++-----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..1b3cfdc2a580e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6585,6 +6585,11 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
           cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
           !ST->hasFullFP16())
         return false;
+      for (unsigned I = 0; I < 2; ++I) {
+        // Sinking fnegs will unlock fmls combine pattern
+        if (match(II->getOperand(I), m_FNeg(m_Value())))
+          Ops.push_back(&II->getOperandUse(I));
+      }
       [[fallthrough]];
     case Intrinsic::aarch64_neon_sqdmull:
     case Intrinsic::aarch64_neon_sqdmulh:
diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
index e428c8e4b2bbb..784f1166a7485 100644
--- a/llvm/test/CodeGen/AArch64/sink-fneg.ll
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -9,8 +9,7 @@ define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float>
 ; CHECK-NEXT:    str q2, [x1]
 ; CHECK-NEXT:    tbz w0, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1: // %use_bb
-; CHECK-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    str q4, [x2]
 ; CHECK-NEXT:  .LBB0_2: // %exit
 ; CHECK-NEXT:    ret
@@ -38,9 +37,7 @@ define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float>
 ; CHECK-NEXT:    str q2, [x1]
 ; CHECK-NEXT:    tbz w0, #0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1: // %use_bb
-; CHECK-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NEXT:    fneg v1.4s, v3.4s
-; CHECK-NEXT:    fmla v4.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    str q4, [x2]
 ; CHECK-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEXT:    ret
@@ -66,11 +63,12 @@ define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y,
 ; CHECK-LABEL: shared_fneg_with_other_users:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fneg v0.4s, v0.4s
+; CHECK-NEXT:    fneg v1.4s, v0.4s
 ; CHECK-NEXT:    str q2, [x1]
 ; CHECK-NEXT:    tbz w0, #0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %use_bb
-; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    str q4, [x2]
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_2: // %other_use
@@ -78,8 +76,8 @@ define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y,
 ; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp] // 16-byte Spill
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q1, [sp] // 16-byte Spill
 ; CHECK-NEXT:    bl foo
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Reload
@@ -113,8 +111,7 @@ define void @shared_fneg_across_bbs_fmuladd(<4 x float> %x, <4 x float> %y, <4 x
 ; CHECK-NEXT:    str q2, [x1]
 ; CHECK-NEXT:    tbz w0, #0, .LBB3_2
 ; CHECK-NEXT:  // %bb.1: // %use_bb
-; CHECK-NEXT:    fneg v0.4s, v0.4s
-; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    str q4, [x2]
 ; CHECK-NEXT:  .LBB3_2: // %exit
 ; CHECK-NEXT:    ret