[llvm] [AArch64][SME] Fix ADDVL addressing to scavenged stackslot. (PR #109674)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 08:16:12 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Sander de Smalen (sdesmalen-arm)
<details>
<summary>Changes</summary>
In https://reviews.llvm.org/D159196 we avoided stackslot scavenging when no FP was available. But when an FP is available, we should actually prefer using the FP over the BP.

This change affects more than just SME, but it should be a general improvement: any slot above the address pointed to by the FP is always closer to the FP than to the BP, so it makes sense to always favour the FP for addressing such a slot when an FP is available.

This also fixes the issue for SME, where using the FP is not just preferred but required: a slot above the FP can be addressed with a fixed offset, whereas addressing it relative to SP/BP requires ADDVL arithmetic, which scales with the vector length and so is unsafe across a streaming-mode change.
---
Full diff: https://github.com/llvm/llvm-project/pull/109674.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+5-6)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll (+45)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7e041b086599b9..30c3a886c40453 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2757,7 +2757,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
bool FPOffsetFits = !ForSimm || FPOffset >= -256;
PreferFP |= Offset > -FPOffset && !SVEStackSize;
- if (MFI.hasVarSizedObjects()) {
+ if (FPOffset >= 0) {
+ // If the FPOffset is positive, that'll always be best, as the SP/BP
+ // will be even further away.
+ UseFP = true;
+  } else if (MFI.hasVarSizedObjects()) {
// If we have variable sized objects, we can use either FP or BP, as the
// SP offset is unknown. We can use the base pointer if we have one and
// FP is not preferred. If not, we're stuck with using FP.
@@ -2769,11 +2773,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
- } else if (FPOffset >= 0) {
- // Use SP or FP, whichever gives us the best chance of the offset
- // being in range for direct access. If the FPOffset is positive,
- // that'll always be best, as the SP will be even further away.
- UseFP = true;
} else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
// Funclets access the locals contained in the parent's stack frame
// via the frame pointer, so we have to use the FP in the parent
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index ac19bd59babe46..803bb9fda458b9 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -45,6 +45,51 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
ret void
}
+define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" {
+; CHECK-LABEL: test_no_stackslot_scavenging_with_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-128]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x28, x25, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: lsl x9, x0, #3
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: str s0, [x29, #28] // 4-byte Folded Spill
+; CHECK-NEXT: add x9, x9, #15
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr s0, [x29, #28] // 4-byte Folded Reload
+; CHECK-NEXT: bl use_f
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x24, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x25, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #128 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ptr2 = alloca i64, i64 %n, align 8
+ %ptr = alloca <vscale x 16 x i8>
+ call void asm sideeffect "", "~{x24},~{x25}"() nounwind
+ call void @use_f(float %f)
+ ret void
+}
+
declare void @use_f(float)
+declare void @use_f_and_ptr(float, ptr)
attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }
``````````
</details>
https://github.com/llvm/llvm-project/pull/109674