[llvm] [AArch64] Sink fneg instruction to unlock fmls combine (PR #172000)

Sat Dec 13 03:28:44 PST 2025

================
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+
+; A single fneg of %x, defined in entry, is the second multiplicand of an
+; llvm.fma call in entry and of another llvm.fma call in use_bb. The expected
+; output below has one fmls per block and no standalone fneg instruction.
+define void @shared_fneg_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ret
+                                    <4 x float> %a, <4 x float> %b,
+                                    i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  ; The negation shared by both fma calls in this function.
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  ; Second use of %neg, in a different basic block from its definition.
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Both multiplicands of each llvm.fma call are negated values (%nega and
+; %negx), defined once in entry and reused in use_bb. The expected output
+; below is a plain fmla on v0/v3 in each block with no fneg instruction
+; (v0/v3 presumably hold %x/%a under the AArch64 calling convention).
+; Note that %y is not used by this function.
+define void @shared_fnegs_across_bbs(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fnegs_across_bbs:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmla v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ret
+                                     <4 x float> %a, <4 x float> %b,
+                                     i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  ; Two negations, each shared by both fma calls in this function.
+  %negx = fneg <4 x float> %x
+  %nega = fneg <4 x float> %a
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  ; Same pair of negated multiplicands, different addend (%b).
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %nega, <4 x float> %negx, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+; The shared fneg of %x feeds two llvm.fma calls (entry and use_bb) and also
+; has non-fma users: it is passed to @foo in other_use and returned from exit.
+; The expected output below still uses fmls for both fma calls, and also
+; keeps one standalone fneg in the entry block for the remaining users.
+define <4 x float> @shared_fneg_with_other_users(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_with_other_users:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fneg v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: // %other_use
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str q1, [sp] // 16-byte Spill
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+                                                 <4 x float> %a, <4 x float> %b,
+                                                 i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  ; Negation used by both fma calls, by the call to @foo, and by the return.
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %other_use
+
+use_bb:
+  %r2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+other_use:
+  ; Non-fma use: the negated value itself is an argument to @foo.
+  call void @foo(<4 x float> %neg)
+  br label %exit
+
+exit:
+  ; Non-fma use: the negated value is the function's return value.
+  ret <4 x float> %neg
+}
+
+; Same shape as @shared_fneg_across_bbs, but the two users of the shared fneg
+; are llvm.fmuladd calls instead of llvm.fma calls. The expected output below
+; is again one fmls per block with no standalone fneg instruction.
+define void @shared_fneg_across_bbs_fmuladd(<4 x float> %x, <4 x float> %y, <4 x float> %z,
+; CHECK-LABEL: shared_fneg_across_bbs_fmuladd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q2, [x1]
+; CHECK-NEXT:    tbz w0, #0, .LBB3_2
+; CHECK-NEXT:  // %bb.1: // %use_bb
+; CHECK-NEXT:    fmls v4.4s, v3.4s, v0.4s
+; CHECK-NEXT:    str q4, [x2]
+; CHECK-NEXT:  .LBB3_2: // %exit
+; CHECK-NEXT:    ret
+                                    <4 x float> %a, <4 x float> %b,
+                                    i1 %cond, ptr %out1, ptr %out2) {
+entry:
+  ; The negation shared by both fmuladd calls in this function.
+  %neg = fneg <4 x float> %x
+  %r1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %y, <4 x float> %neg, <4 x float> %z)
+  store <4 x float> %r1, ptr %out1
+  br i1 %cond, label %use_bb, label %exit
+
+use_bb:
+  ; Second use of %neg, in a different basic block from its definition.
+  %r2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %neg, <4 x float> %b)
+  store <4 x float> %r2, ptr %out2
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare void @foo(<4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
----------------
davemgreen wrote:

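The intrinsic declarations at the end of the test (`@llvm.fma.v4f32` and `@llvm.fmuladd.v4f32`) can be removed nowadays; intrinsics no longer need to be declared explicitly in test IR.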

https://github.com/llvm/llvm-project/pull/172000