[llvm] [AArch64] Sink fneg instruction to unlock fmls combine (PR #172000)
Valeriy Savchenko via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 04:56:07 PST 2025
https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/172000
>From e553a3cef97d552a815606ae1d67509a334feb8c Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Fri, 12 Dec 2025 11:39:03 +0000
Subject: [PATCH 1/2] [AArch64][NFC] Add test for suboptimal fmls combine
---
llvm/test/CodeGen/AArch64/sink-fneg.ll | 140 +++++++++++++++++++++++++
1 file changed, 140 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sink-fneg.ll
diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
new file mode 100644
index 0000000000000..e428c8e4b2bbb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+
+define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB0_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fneg v0.4s, v0.4s
+; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: .LBB0_2: // %exit
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ %neg = fneg <4 x float> %x
+ %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+exit:
+ ret void
+}
+
+define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fnegs_across_bbs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v2.4s, v0.4s, v3.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fneg v0.4s, v0.4s
+; CHECK-NEXT: fneg v1.4s, v3.4s
+; CHECK-NEXT: fmla v4.4s, v0.4s, v1.4s
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: .LBB1_2: // %exit
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ %negx = fneg <4 x float> %x
+ %nega = fneg <4 x float> %a
+ %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+exit:
+ ret void
+}
+
+define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_with_other_users:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: fneg v0.4s, v0.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB2_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_2: // %other_use
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: str q0, [sp] // 16-byte Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ %neg = fneg <4 x float> %x
+ %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %other_use
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+other_use:
+ call void @foo(<4 x float> %neg)
+ br label %exit
+
+exit:
+ ret <4 x float> %neg
+}
+
+define void @shared_fneg_across_bbs_fmuladd(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs_fmuladd:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: str q2, [x1]
+; CHECK-NEXT: tbz w0, #0, .LBB3_2
+; CHECK-NEXT: // %bb.1: // %use_bb
+; CHECK-NEXT: fneg v0.4s, v0.4s
+; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: str q4, [x2]
+; CHECK-NEXT: .LBB3_2: // %exit
+; CHECK-NEXT: ret
+ <4 x float> %a, <4 x float> %b,
+ i1 %cond, ptr %out1, ptr %out2) {
+entry:
+ %neg = fneg <4 x float> %x
+ %r1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+ store <4 x float> %r1, ptr %out1
+ br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+ %r2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+ store <4 x float> %r2, ptr %out2
+ br label %exit
+
+exit:
+ ret void
+}
+
+declare void @foo(<4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
>From 80c46969e03dd6b66a1bf26b826b2c5691a2a275 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko at apple.com>
Date: Thu, 11 Dec 2025 12:43:23 +0000
Subject: [PATCH 2/2] [AArch64] Sink fneg instruction to unlock fmls combine
---
.../AArch64/AArch64TargetTransformInfo.cpp | 5 +++++
llvm/test/CodeGen/AArch64/sink-fneg.ll | 19 ++++++++-----------
2 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..1b3cfdc2a580e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6585,6 +6585,11 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
!ST->hasFullFP16())
return false;
+ for (unsigned I = 0; I < 2; ++I) {
+ // Sinking fnegs next to their user unlocks the fmls combine pattern.
+ if (match(II->getOperand(I), m_FNeg(m_Value())))
+ Ops.push_back(&II->getOperandUse(I));
+ }
[[fallthrough]];
case Intrinsic::aarch64_neon_sqdmull:
case Intrinsic::aarch64_neon_sqdmulh:
diff --git a/llvm/test/CodeGen/AArch64/sink-fneg.ll b/llvm/test/CodeGen/AArch64/sink-fneg.ll
index e428c8e4b2bbb..784f1166a7485 100644
--- a/llvm/test/CodeGen/AArch64/sink-fneg.ll
+++ b/llvm/test/CodeGen/AArch64/sink-fneg.ll
@@ -9,8 +9,7 @@ define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float>
; CHECK-NEXT: str q2, [x1]
; CHECK-NEXT: tbz w0, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %use_bb
-; CHECK-NEXT: fneg v0.4s, v0.4s
-; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: fmls v4.4s, v3.4s, v0.4s
; CHECK-NEXT: str q4, [x2]
; CHECK-NEXT: .LBB0_2: // %exit
; CHECK-NEXT: ret
@@ -38,9 +37,7 @@ define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float>
; CHECK-NEXT: str q2, [x1]
; CHECK-NEXT: tbz w0, #0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %use_bb
-; CHECK-NEXT: fneg v0.4s, v0.4s
-; CHECK-NEXT: fneg v1.4s, v3.4s
-; CHECK-NEXT: fmla v4.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
; CHECK-NEXT: str q4, [x2]
; CHECK-NEXT: .LBB1_2: // %exit
; CHECK-NEXT: ret
@@ -66,11 +63,12 @@ define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y,
; CHECK-LABEL: shared_fneg_with_other_users:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v2.4s, v1.4s, v0.4s
-; CHECK-NEXT: fneg v0.4s, v0.4s
+; CHECK-NEXT: fneg v1.4s, v0.4s
; CHECK-NEXT: str q2, [x1]
; CHECK-NEXT: tbz w0, #0, .LBB2_2
; CHECK-NEXT: // %bb.1: // %use_bb
-; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: str q4, [x2]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_2: // %other_use
@@ -78,8 +76,8 @@ define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y,
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: str q0, [sp] // 16-byte Spill
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: str q1, [sp] // 16-byte Spill
; CHECK-NEXT: bl foo
; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Reload
@@ -113,8 +111,7 @@ define void @shared_fneg_across_bbs_fmuladd(<4 x float> %x, <4 x float> %y, <4 x
; CHECK-NEXT: str q2, [x1]
; CHECK-NEXT: tbz w0, #0, .LBB3_2
; CHECK-NEXT: // %bb.1: // %use_bb
-; CHECK-NEXT: fneg v0.4s, v0.4s
-; CHECK-NEXT: fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: fmls v4.4s, v3.4s, v0.4s
; CHECK-NEXT: str q4, [x2]
; CHECK-NEXT: .LBB3_2: // %exit
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list