[llvm] [AArch64][SME] Fix ADDVL addressing to scavenged stackslot. (PR #109674)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 23 08:32:44 PDT 2024


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/109674

From fe7a0446ff25c7edf1c4919deb24594c81c2c051 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 23 Sep 2024 15:16:00 +0100
Subject: [PATCH 1/2] Add test (which shows the issue)

---
 ...nging-call-disable-stackslot-scavenging.ll | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index ac19bd59babe46..3a35886a69fa23 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -45,6 +45,53 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
   ret void
 }
 
+define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" {
+; CHECK-LABEL: test_no_stackslot_scavenging_with_fp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-128]! // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x28, x25, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    lsl x9, x0, #3
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    addvl x8, x19, #1
+; CHECK-NEXT:    add x9, x9, #15
+; CHECK-NEXT:    str s0, [x8, #92] // 4-byte Folded Spill
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    addvl x8, x19, #1
+; CHECK-NEXT:    ldr s0, [x8, #92] // 4-byte Folded Reload
+; CHECK-NEXT:    bl use_f
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x24, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x25, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #128 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ptr2 = alloca i64, i64 %n, align 8
+  %ptr = alloca <vscale x 16 x i8>
+  call void asm sideeffect "", "~{x24},~{x25}"() nounwind
+  call void @use_f(float %f)
+  ret void
+}
+
 declare void @use_f(float)
+declare void @use_f_and_ptr(float, ptr)
 
 attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }
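
To see why the generated code above is broken: the spill and reload of
s0 both compute the slot's address with "addvl x8, x19, #1", but ADDVL
adds imm * VL bytes using the current vector length, and the reload
runs after "smstop sm" has switched streaming mode (and hence possibly
VL). A minimal stand-alone sketch of the two address computations (the
VL values below are illustrative assumptions, not values taken from
the test):

  #include <cstdint>
  #include <cstdio>

  // ADDVL Xd, Xn, #imm computes Xn + imm * VL, with VL in bytes.
  int64_t addvl(int64_t base, int64_t imm, int64_t vlBytes) {
    return base + imm * vlBytes;
  }

  int main() {
    const int64_t BP = 0x1000; // x19, the base pointer (arbitrary here)
    // Spill, in streaming mode: assume a 256-bit streaming VL (32 bytes).
    int64_t SpillAddr = addvl(BP, /*imm=*/1, /*vlBytes=*/32) + 92;
    // Reload, after smstop: assume a 128-bit non-streaming VL (16 bytes).
    int64_t ReloadAddr = addvl(BP, /*imm=*/1, /*vlBytes=*/16) + 92;
    // The two differ whenever the streaming and non-streaming vector
    // lengths differ, so the reload reads the wrong stack slot. A fixed
    // byte offset from the FP, e.g. "[x29, #28]", is immune to this.
    printf("spill %#llx, reload %#llx\n", (long long)SpillAddr,
           (long long)ReloadAddr);
    return 0;
  }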

From 5571e53ab79a17eadfbcf2a2af4fb440f02d1344 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 23 Sep 2024 16:02:50 +0100
Subject: [PATCH 2/2] [AArch64][SME] Fix 'addvl' addressing to scavenged
 stackslot.

In https://reviews.llvm.org/D159196 we avoided stackslot scavenging
when there was no FP available. But in the case where the FP is
available, we should prefer using the FP over the BP.

This change affects more than just SME, but in general it should be
an improvement, since any slot above the address the FP points to is
always closer to the FP than to the BP, so it makes sense to favour
the FP when addressing it.

This also fixes the issue for SME, where using the FP is not just
preferred but required: an ADDVL-based address computed from the BP
is scaled by the current vector length, so recomputing it after a
streaming-mode change (smstop/smstart) yields a different address,
whereas a fixed byte offset from the FP is unaffected.
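
For illustration, a minimal stand-alone model of the reordered
preference checks (the FrameCtx struct and field names are invented
for this sketch; the real logic is in
AArch64FrameLowering::resolveFrameOffsetReference, in the diff below):

  #include <cstdint>

  // Hypothetical, simplified model of the frame-reference decision.
  struct FrameCtx {
    int64_t FPOffset;        // byte offset of the slot from the FP
    bool HasVarSizedObjects; // dynamic allocas make the SP offset unknown
    bool HasBasePointer;     // x19 is reserved as a base pointer
    bool PreferFP;           // earlier heuristics already favour the FP
  };

  // Returns true if the slot should be addressed relative to the FP.
  bool useFP(const FrameCtx &C) {
    // The check that moved to the front: a slot above the FP has a
    // non-negative, VL-independent FP offset, and the SP/BP are always
    // further away.
    if (C.FPOffset >= 0)
      return true;
    if (C.HasVarSizedObjects)
      // The SP offset is unknown, so pick between FP and BP: use the
      // BP if we have one and the FP is not preferred.
      return C.PreferFP || !C.HasBasePointer;
    return C.PreferFP; // remaining cases elided from the sketch
  }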
---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp      | 11 +++++------
 ...mode-changing-call-disable-stackslot-scavenging.ll |  8 +++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7e041b086599b9..fde07d84e97f58 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2757,7 +2757,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
       bool FPOffsetFits = !ForSimm || FPOffset >= -256;
       PreferFP |= Offset > -FPOffset && !SVEStackSize;
 
-      if (MFI.hasVarSizedObjects()) {
+      if (FPOffset >= 0) {
+        // If the FPOffset is positive, that'll always be best, as the SP/BP
+        // will be even further away.
+        UseFP = true;
+      } else if (MFI.hasVarSizedObjects()) {
         // If we have variable sized objects, we can use either FP or BP, as the
         // SP offset is unknown. We can use the base pointer if we have one and
         // FP is not preferred. If not, we're stuck with using FP.
@@ -2769,11 +2773,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
         // else we can use BP and FP, but the offset from FP won't fit.
         // That will make us scavenge registers which we can probably avoid by
         // using BP. If it won't fit for BP either, we'll scavenge anyway.
-      } else if (FPOffset >= 0) {
-        // Use SP or FP, whichever gives us the best chance of the offset
-        // being in range for direct access. If the FPOffset is positive,
-        // that'll always be best, as the SP will be even further away.
-        UseFP = true;
       } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
         // Funclets access the locals contained in the parent's stack frame
         // via the frame pointer, so we have to use the FP in the parent
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index 3a35886a69fa23..803bb9fda458b9 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -60,19 +60,17 @@ define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-po
 ; CHECK-NEXT:    stp x24, x19, [sp, #112] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    lsl x9, x0, #3
+; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    mov x19, sp
-; CHECK-NEXT:    addvl x8, x19, #1
+; CHECK-NEXT:    str s0, [x29, #28] // 4-byte Folded Spill
 ; CHECK-NEXT:    add x9, x9, #15
-; CHECK-NEXT:    str s0, [x8, #92] // 4-byte Folded Spill
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
 ; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    addvl x8, x19, #1
-; CHECK-NEXT:    ldr s0, [x8, #92] // 4-byte Folded Reload
+; CHECK-NEXT:    ldr s0, [x29, #28] // 4-byte Folded Reload
 ; CHECK-NEXT:    bl use_f
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    sub sp, x29, #64


