[llvm] [AArch64] Fix a corner case with large stack allocation (PR #122038)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 15 12:25:50 PST 2025


https://github.com/ssijaric-nv updated https://github.com/llvm/llvm-project/pull/122038

>From 386066bd563eac41ccfb6c85336c07a19e90bf66 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Thu, 19 Dec 2024 12:34:12 -0800
Subject: [PATCH 1/3] [AArch64] Fix a corner case with large stack allocation

In the unlikely case where the stack size is greater than 4GB, we may run into the
situation where the local stack size and the callee saved registers stack size
get combined incorrectly when restoring the callee saved registers.  This happens
because the stack size in shouldCombineCSRLocalStackBumpInEpilogue is represented
as an 'unsigned', but is passed in as an 'int64_t'. We end up with something like

$fp, $lr = frame-destroy LDPXi $sp, 536870912

This change just makes 'shouldCombineCSRLocalStackBumpInEpilogue' match
'shouldCombineCSRLocalStackBump' where 'StackBumpBytes' is an 'uint64_t'
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  5 +-
 .../lib/Target/AArch64/AArch64FrameLowering.h |  2 +-
 .../AArch64/aarch64-large-stack-spbump.mir    | 55 +++++++++++++++++++
 3 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 64dfb1e39485f8..f10841b9fdcc77 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1195,10 +1195,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
 }
 
 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
-    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+    MachineBasicBlock &MBB, uint64_t StackBumpBytes) const {
   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
     return false;
-
   if (MBB.empty())
     return true;
 
@@ -2363,7 +2362,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   }
   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
-
+  //
   bool CombineAfterCSRBump = false;
   if (!CombineSPBump && PrologueSaveSize != 0) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 20445e63bcb13e..8f84702f4d2baf 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -146,7 +146,7 @@ class AArch64FrameLowering : public TargetFrameLowering {
                                       int &MinCSFrameIndex,
                                       int &MaxCSFrameIndex) const;
   bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
-                                                unsigned StackBumpBytes) const;
+                                                uint64_t StackBumpBytes) const;
   void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI) const;
   void emitCalleeSavedSVELocations(MachineBasicBlock &MBB,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir b/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir
new file mode 100644
index 00000000000000..4843dfe1d5cffa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir
@@ -0,0 +1,55 @@
+# RUN: llc -mtriple=aarch64 -run-pass=prologepilog %s -o - | FileCheck %s
+--- |
+  ; ModuleID = 'bug.c'
+  source_filename = "bug.c"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+  target triple = "aarch64-unknown-linux-gnu"
+ 
+  ; Function Attrs: mustprogress noinline optnone uwtable
+  define dso_local noundef i32 @_Z4funcv() #0 {
+  entry:
+    %array = alloca [1073741824 x i32], align 4
+    %arrayidx = getelementptr inbounds [1073741824 x i32], ptr %array, i64 0, i64 20
+    store i32 7, ptr %arrayidx, align 4
+    call void @_Z5func2v()
+    %arrayidx1 = getelementptr inbounds [1073741824 x i32], ptr %array, i64 0, i64 20
+    %0 = load i32, ptr %arrayidx1, align 4
+    ret i32 %0
+  }
+ 
+  declare void @_Z5func2v() #1
+ 
+  attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
+  attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
+...
+---
+name:            _Z4funcv
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        true
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+frameInfo:
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:
+  - { id: 0, name: array, size: 4294967296, alignment: 4, local-offset: -4294967296 }
+machineFunctionInfo: {}
+body:             |
+  bb.1.entry:
+    renamable $w8 = MOVi32imm 7
+    STRWui killed renamable $w8, %stack.0.array, 20 :: (store (s32) into %ir.arrayidx)
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    BL @_Z5func2v, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    renamable $w0 = LDRWui %stack.0.array, 20 :: (dereferenceable load (s32) from %ir.arrayidx1)
+    ; CHECK: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp
+    RET_ReallyLR implicit killed $w0
+
+...

>From 80cc1e5636d3ef6e44e587728a4011a5d5e6947b Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Sun, 12 Jan 2025 11:13:45 -0800
Subject: [PATCH 2/3] Remove the errant comment line

---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f10841b9fdcc77..f5c33498c9d499 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2362,7 +2362,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   }
   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
-  //
   bool CombineAfterCSRBump = false;
   if (!CombineSPBump && PrologueSaveSize != 0) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());

>From 929b5ab018b781606d1317bd19c3e3cf68d60934 Mon Sep 17 00:00:00 2001
From: Sanjin Sijaric <ssijaric at nvidia.com>
Date: Wed, 15 Jan 2025 11:47:18 -0800
Subject: [PATCH 3/3] Clean up the test case

---
 .../AArch64/aarch64-large-stack-spbump.mir        | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir b/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir
index 4843dfe1d5cffa..f920813f2b42d5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-large-stack-spbump.mir
@@ -1,12 +1,6 @@
 # RUN: llc -mtriple=aarch64 -run-pass=prologepilog %s -o - | FileCheck %s
 --- |
-  ; ModuleID = 'bug.c'
-  source_filename = "bug.c"
-  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
-  target triple = "aarch64-unknown-linux-gnu"
- 
-  ; Function Attrs: mustprogress noinline optnone uwtable
-  define dso_local noundef i32 @_Z4funcv() #0 {
+  define i32 @_Z4funcv() {
   entry:
     %array = alloca [1073741824 x i32], align 4
     %arrayidx = getelementptr inbounds [1073741824 x i32], ptr %array, i64 0, i64 20
@@ -17,10 +11,7 @@
     ret i32 %0
   }
  
-  declare void @_Z5func2v() #1
- 
-  attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
-  attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
+  declare void @_Z5func2v()
 ...
 ---
 name:            _Z4funcv
@@ -49,7 +40,7 @@ body:             |
     BL @_Z5func2v, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp
     ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
     renamable $w0 = LDRWui %stack.0.array, 20 :: (dereferenceable load (s32) from %ir.arrayidx1)
-    ; CHECK: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp
+    ; CHECK: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
     RET_ReallyLR implicit killed $w0
 
 ...



More information about the llvm-commits mailing list