[llvm] [AArch64] Fix range check for STGPostIndex (PR #117146)

Oliver Stannard via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 21 04:05:23 PST 2024


https://github.com/ostannard created https://github.com/llvm/llvm-project/pull/117146

When generating function epilogues using AArch64 stack tagging, we can fold an SP update into the tag-setting loop. The loop tags 32 bytes at a time using ST2G, so the final SP update might be done either by a post-indexed STG, which tags the final 16 bytes of the tagged region, or by an ADD/SUB instruction after the loop. However, we were only considering the range of the ADD/SUB instructions when deciding whether to do this fold. The valid immediate range for STG is slightly smaller when the offset is positive, because the offset is a signed immediate and must also cover the extra 16 bytes being tagged.
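
For reference, here is a minimal standalone sketch of the range reasoning (the helper name and the main() driver are mine, for illustration only; the constants follow the patch below): STGPostIndex takes a signed 9-bit immediate scaled by 16, i.e. offsets in [-4096, 4080], and because the folded post-indexed store also tags 16 bytes itself, the largest positive SP adjustment it can express here is 4080 - 16.

#include <cassert>
#include <cstdint>

// Illustrative helper (not the actual LLVM function): decide whether the
// leftover SP adjustment after the ST2G loop can be folded, given that it
// may be materialised either as an STGPostIndex (signed 9-bit immediate
// scaled by 16, so [-4096, 4080], minus the 16 bytes the store itself tags)
// or as an unshifted ADD/SUB (12-bit immediate, magnitude <= 4095).
static bool canFoldPostLoopUpdate(int64_t PostOffset) {
  const int64_t kMaxOffset = 4080 - 16; // STGPostIndex, positive direction
  const int64_t kMinOffset = -4095;     // SUBXri, negative direction
  return PostOffset % 16 == 0 && PostOffset >= kMinOffset &&
         PostOffset <= kMaxOffset;
}

int main() {
  assert(canFoldPostLoopUpdate(4064));   // 4080 - 16: still encodable
  assert(!canFoldPostLoopUpdate(4080));  // would need an STG immediate of 4096
  assert(canFoldPostLoopUpdate(-4080));  // negative side has slightly more room
  return 0;
}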

From da334d363abb1cc4f8592bb69b62c5987168f4f7 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 21 Nov 2024 11:36:54 +0000
Subject: [PATCH 1/3] Add test showing bug

---
 .../AArch64/stack-tagging-epilogue-fold.mir   | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir

diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir b/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir
new file mode 100644
index 00000000000000..00c0bdaabaa4af
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir
@@ -0,0 +1,71 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+  target triple = "aarch64-arm-none-eabi"
+
+  define void @F76(i32 %0, ...) #0 {
+    ret void
+  }
+
+...
+---
+name:            F76
+fixedStack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 16, stack-id: default,
+      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+stack:
+  - { id: 0, name: '', type: default, offset: 0, size: 4080, alignment: 16,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4080, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: default, offset: 0, size: 32, alignment: 16,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4112, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: '', type: default, offset: 0, size: 3888, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8000, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: '', type: default, offset: 0, size: 56, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8056, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: '', type: default, offset: 0, size: 128, alignment: 16,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8192, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+body:             |
+  bb.0 (%ir-block.1):
+    liveins: $q0, $q1, $q2, $q3, $q4, $q5, $q6, $q7, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+
+    ; CHECK-LABEL: name: F76
+    ; CHECK: liveins: $q0, $q1, $q2, $q3, $q4, $q5, $q6, $q7, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $fp
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5)
+    ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2, 12
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8208
+    ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+    ; CHECK-NEXT: renamable $x8 = IRGstack $sp, $xzr
+    ; CHECK-NEXT: renamable $x9 = TAGPstack $x8, 2, renamable $x8, 1
+    ; CHECK-NEXT: renamable $x10 = COPY renamable $x9
+    ; CHECK-NEXT: dead early-clobber renamable $x11, dead early-clobber renamable $x10 = STGloop_wback 4080, killed renamable $x10, implicit-def dead $nzcv
+    ; CHECK-NEXT: ST2Gi renamable $x8, renamable $x8, 0
+    ; CHECK-NEXT: dead early-clobber $x8, early-clobber $sp = frame-destroy STGloop_wback 4096, $sp, implicit-def $nzcv
+    ; CHECK-NEXT: $sp = frame-destroy STGPostIndex $sp, $sp, 256
+    ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)
+    ; CHECK-NEXT: RET_ReallyLR
+    /* Prologue */
+    renamable $x8 = IRGstack $sp, $xzr
+    renamable $x9 = TAGPstack %stack.0, 0, renamable $x8, 1
+    renamable $x10 = COPY renamable $x9
+    dead early-clobber renamable $x11, dead early-clobber renamable $x10 = STGloop_wback 4080, killed renamable $x10, implicit-def dead $nzcv
+    ST2Gi renamable $x8, renamable $x8, 0
+
+    /* Epilogue */
+    dead early-clobber renamable $x8, dead early-clobber renamable $x9 = STGloop 4080, %stack.0, implicit-def dead $nzcv
+    ST2Gi $sp, %stack.1, 0
+    RET_ReallyLR
+...

From 7c959b33f1f8f0ccf5d33730f0bcef5fc3a9f45d Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 21 Nov 2024 11:38:27 +0000
Subject: [PATCH 2/3] [AArch64] Fix range check for STGPostIndex

When generating function epilogues using AArch64 stack tagging, we can
fold an SP update into the tag-setting loop. The loop tags 32 bytes at
a time using ST2G, so the final SP update might be done either by a
post-indexed STG, which tags the final 16 bytes of the tagged region,
or by an ADD/SUB instruction after the loop. However, we were only
considering the range of the ADD/SUB instructions when deciding whether
to do this fold, and the valid immediate range for STG is slightly
smaller when the offset is positive.
---
 .../Target/AArch64/AArch64FrameLowering.cpp    | 18 ++++++++++++++----
 .../AArch64/stack-tagging-epilogue-fold.mir    |  5 +++--
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 216244950ba9ee..18acf62596c04c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4393,10 +4393,20 @@ bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
     int64_t Offset = MI.getOperand(2).getImm() << Shift;
     if (MI.getOpcode() == AArch64::SUBXri)
       Offset = -Offset;
-    int64_t AbsPostOffset = std::abs(Offset - Size);
-    const int64_t kMaxOffset =
-        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
-    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
+    int64_t PostOffset = Offset - Size;
+    // TagStoreEdit::emitLoop might emit either an ADD/SUB after the loop, or
+    // an STGPostIndex which also tags the last 16 bytes. Which one is chosen
+    // depends on the alignment of the loop size, but the difference between
+    // the valid ranges for the two instructions is small, so we
+    // conservatively assume that it could be either case here.
+    //
+    // Max offset of STGPostIndex, minus the 16-byte tag write folded into
+    // that instruction.
+    const int64_t kMaxOffset = 4080 - 16;
+    // Max offset of SUBXri.
+    const int64_t kMinOffset = -4095;
+    if (PostOffset <= kMaxOffset && PostOffset >= kMinOffset &&
+        PostOffset % 16 == 0) {
       *TotalOffset = Offset;
       return true;
     }
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir b/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir
index 00c0bdaabaa4af..bcd716c5c8d6c0 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-epilogue-fold.mir
@@ -53,8 +53,9 @@ body:             |
     ; CHECK-NEXT: renamable $x10 = COPY renamable $x9
     ; CHECK-NEXT: dead early-clobber renamable $x11, dead early-clobber renamable $x10 = STGloop_wback 4080, killed renamable $x10, implicit-def dead $nzcv
     ; CHECK-NEXT: ST2Gi renamable $x8, renamable $x8, 0
-    ; CHECK-NEXT: dead early-clobber $x8, early-clobber $sp = frame-destroy STGloop_wback 4096, $sp, implicit-def $nzcv
-    ; CHECK-NEXT: $sp = frame-destroy STGPostIndex $sp, $sp, 256
+    ; CHECK-NEXT: $x9 = ADDXri $sp, 0, 0
+    ; CHECK-NEXT: dead early-clobber $x8, dead early-clobber $x9 = STGloop_wback 4112, $x9, implicit-def $nzcv
+    ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2, 12
     ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)
     ; CHECK-NEXT: RET_ReallyLR
     /* Prologue */

From 9b95f600e56b283f7238fa8048d92a4cb80450db Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Thu, 21 Nov 2024 11:01:38 +0000
Subject: [PATCH 3/3] [AArch64] Add assertions and debug trace (NFC)

---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 18acf62596c04c..e6057d8a028849 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4355,26 +4355,37 @@ void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
 
   int64_t ExtraBaseRegUpdate =
       FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
+  LLVM_DEBUG(dbgs() << "TagStoreEdit::emitLoop: LoopSize=" << LoopSize
+                    << ", Size=" << Size
+                    << ", ExtraBaseRegUpdate=" << ExtraBaseRegUpdate
+                    << ", FrameRegUpdate=" << FrameRegUpdate
+                    << ", FrameRegOffset.getFixed()="
+                    << FrameRegOffset.getFixed() << "\n");
   if (LoopSize < Size) {
     assert(FrameRegUpdate);
     assert(Size - LoopSize == 16);
     // Tag 16 more bytes at BaseReg and update BaseReg.
+    int64_t STGOffset = ExtraBaseRegUpdate + 16;
+    assert(STGOffset % 16 == 0 && STGOffset >= -4096 && STGOffset <= 4080 &&
+           "STG immediate out of range");
     BuildMI(*MBB, InsertI, DL,
             TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
         .addDef(BaseReg)
         .addReg(BaseReg)
         .addReg(BaseReg)
-        .addImm(1 + ExtraBaseRegUpdate / 16)
+        .addImm(STGOffset / 16)
         .setMemRefs(CombinedMemRefs)
         .setMIFlags(FrameRegUpdateFlags);
   } else if (ExtraBaseRegUpdate) {
     // Update BaseReg.
+    int64_t AddSubOffset = std::abs(ExtraBaseRegUpdate);
+    assert(AddSubOffset <= 4095 && "ADD/SUB immediate out of range");
     BuildMI(
         *MBB, InsertI, DL,
         TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
         .addDef(BaseReg)
         .addReg(BaseReg)
-        .addImm(std::abs(ExtraBaseRegUpdate))
+        .addImm(AddSubOffset)
         .addImm(0)
         .setMIFlags(FrameRegUpdateFlags);
   }


