[llvm] [AArch64] Fix frame-pointer offset with hazard padding (PR #118091)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 29 05:23:44 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
<details>
<summary>Changes</summary>
The `-aarch64-stack-hazard-size=<val>` option disables register paring (as the hazard padding may mean the offset is too large for STP/LDP).
This broke setting the frame-pointer offset, as the code to find the frame record looked for a (FP, LR) register pair.
This patch resolves this by looking for FP, LR as two unpaired registers when hazard padding is enabled.
---
Full diff: https://github.com/llvm/llvm-project/pull/118091.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+13-3)
- (modified) llvm/test/CodeGen/AArch64/stack-hazard.ll (+131-26)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index d6673969aa3056..d593738cf32414 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3167,11 +3167,21 @@ static void computeCalleeSaveRegisterPairs(
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
+ auto isFrameRecord = [&] {
+ if (RPI.isPaired())
+ return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
+ : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
+ // -aarch64-stack-hazard-size=<val> disables register pairing, so look
+ // for the frame record as two unpaired registers.
+ if (AFI->hasStackHazardSlotIndex())
+ return i > 0 && RPI.Reg1 == AArch64::FP &&
+ CSI[i - 1].getReg() == AArch64::LR;
+ return false;
+ };
+
// Save the offset to frame record so that the FP register can point to the
// innermost frame record (spilled FP and LR registers).
- if (NeedsFrameRecord &&
- ((!IsWindows && RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
- (IsWindows && RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR)))
+ if (NeedsFrameRecord && isFrameRecord())
AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
RegPairs.push_back(RPI);
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 50a2e41f45756d..a4c2b30566a951 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -337,19 +337,18 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
; CHECK64-LABEL: csr_d8_allocd_framepointer:
; CHECK64: // %bb.0: // %entry
; CHECK64-NEXT: sub sp, sp, #176
-; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill
+; CHECK64-NEXT: stp d0, d8, [sp, #72] // 8-byte Folded Spill
; CHECK64-NEXT: stp x29, x30, [sp, #152] // 16-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #80
-; CHECK64-NEXT: .cfi_def_cfa w29, 96
+; CHECK64-NEXT: add x29, sp, #152
+; CHECK64-NEXT: .cfi_def_cfa w29, 24
; CHECK64-NEXT: .cfi_offset w30, -16
; CHECK64-NEXT: .cfi_offset w29, -24
; CHECK64-NEXT: .cfi_offset b8, -96
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
-; CHECK64-NEXT: stur d0, [x29, #-8]
; CHECK64-NEXT: ldr x29, [sp, #152] // 8-byte Folded Reload
-; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: mov w0, wzr
+; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: add sp, sp, #176
; CHECK64-NEXT: ret
;
@@ -358,17 +357,17 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
; CHECK1024-NEXT: sub sp, sp, #1056
; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill
; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: mov x29, sp
+; CHECK1024-NEXT: add x29, sp, #1032
; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill
; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: .cfi_def_cfa w29, 1056
+; CHECK1024-NEXT: .cfi_def_cfa w29, 24
; CHECK1024-NEXT: .cfi_offset w30, -16
; CHECK1024-NEXT: .cfi_offset w29, -24
; CHECK1024-NEXT: .cfi_offset b8, -1056
; CHECK1024-NEXT: mov w0, wzr
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: stur d0, [x29, #-8]
+; CHECK1024-NEXT: str d0, [sp, #1032]
; CHECK1024-NEXT: add sp, sp, #1040
; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload
; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload
@@ -2893,8 +2892,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill
; CHECK64-NEXT: stp x9, x20, [sp, #144] // 16-byte Folded Spill
; CHECK64-NEXT: str x19, [sp, #160] // 8-byte Folded Spill
-; CHECK64-NEXT: mov x29, sp
-; CHECK64-NEXT: .cfi_def_cfa w29, 176
+; CHECK64-NEXT: add x29, sp, #128
+; CHECK64-NEXT: .cfi_def_cfa w29, 48
; CHECK64-NEXT: .cfi_offset w19, -16
; CHECK64-NEXT: .cfi_offset w20, -24
; CHECK64-NEXT: .cfi_offset w30, -40
@@ -2913,11 +2912,11 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: mov w20, w0
; CHECK64-NEXT: msub x9, x8, x8, x9
; CHECK64-NEXT: mov sp, x9
-; CHECK64-NEXT: stur x9, [x29, #-80]
-; CHECK64-NEXT: sub x9, x29, #80
-; CHECK64-NEXT: sturh wzr, [x29, #-70]
-; CHECK64-NEXT: stur wzr, [x29, #-68]
-; CHECK64-NEXT: sturh w8, [x29, #-72]
+; CHECK64-NEXT: stur x9, [x29, #-208]
+; CHECK64-NEXT: sub x9, x29, #208
+; CHECK64-NEXT: sturh wzr, [x29, #-198]
+; CHECK64-NEXT: stur wzr, [x29, #-196]
+; CHECK64-NEXT: sturh w8, [x29, #-200]
; CHECK64-NEXT: msr TPIDR2_EL0, x9
; CHECK64-NEXT: .cfi_offset vg, -32
; CHECK64-NEXT: smstop sm
@@ -2926,14 +2925,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: .cfi_restore vg
; CHECK64-NEXT: smstart za
; CHECK64-NEXT: mrs x8, TPIDR2_EL0
-; CHECK64-NEXT: sub x0, x29, #80
+; CHECK64-NEXT: sub x0, x29, #208
; CHECK64-NEXT: cbnz x8, .LBB33_2
; CHECK64-NEXT: // %bb.1: // %entry
; CHECK64-NEXT: bl __arm_tpidr2_restore
; CHECK64-NEXT: .LBB33_2: // %entry
; CHECK64-NEXT: mov w0, w20
; CHECK64-NEXT: msr TPIDR2_EL0, xzr
-; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: sub sp, x29, #128
; CHECK64-NEXT: .cfi_def_cfa wsp, 176
; CHECK64-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload
; CHECK64-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload
@@ -2972,8 +2971,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
; CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill
; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill
-; CHECK1024-NEXT: mov x29, sp
-; CHECK1024-NEXT: .cfi_def_cfa w29, 1136
+; CHECK1024-NEXT: add x29, sp, #1088
+; CHECK1024-NEXT: .cfi_def_cfa w29, 48
; CHECK1024-NEXT: .cfi_offset w19, -8
; CHECK1024-NEXT: .cfi_offset w20, -16
; CHECK1024-NEXT: .cfi_offset w28, -24
@@ -2993,14 +2992,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: mov w20, w0
; CHECK1024-NEXT: msub x9, x8, x8, x9
; CHECK1024-NEXT: mov sp, x9
-; CHECK1024-NEXT: sub x10, x29, #784
+; CHECK1024-NEXT: sub x10, x29, #1872
; CHECK1024-NEXT: stur x9, [x10, #-256]
-; CHECK1024-NEXT: sub x9, x29, #774
-; CHECK1024-NEXT: sub x10, x29, #772
+; CHECK1024-NEXT: sub x9, x29, #1862
+; CHECK1024-NEXT: sub x10, x29, #1860
; CHECK1024-NEXT: sturh wzr, [x9, #-256]
-; CHECK1024-NEXT: sub x9, x29, #1040
+; CHECK1024-NEXT: sub x9, x29, #2128
; CHECK1024-NEXT: stur wzr, [x10, #-256]
-; CHECK1024-NEXT: sub x10, x29, #776
+; CHECK1024-NEXT: sub x10, x29, #1864
; CHECK1024-NEXT: sturh w8, [x10, #-256]
; CHECK1024-NEXT: msr TPIDR2_EL0, x9
; CHECK1024-NEXT: .cfi_offset vg, -32
@@ -3010,14 +3009,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: .cfi_restore vg
; CHECK1024-NEXT: smstart za
; CHECK1024-NEXT: mrs x8, TPIDR2_EL0
-; CHECK1024-NEXT: sub x0, x29, #1040
+; CHECK1024-NEXT: sub x0, x29, #2128
; CHECK1024-NEXT: cbnz x8, .LBB33_2
; CHECK1024-NEXT: // %bb.1: // %entry
; CHECK1024-NEXT: bl __arm_tpidr2_restore
; CHECK1024-NEXT: .LBB33_2: // %entry
; CHECK1024-NEXT: mov w0, w20
; CHECK1024-NEXT: msr TPIDR2_EL0, xzr
-; CHECK1024-NEXT: mov sp, x29
+; CHECK1024-NEXT: sub sp, x29, #1088
; CHECK1024-NEXT: .cfi_def_cfa wsp, 1136
; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload
@@ -3049,3 +3048,109 @@ entry:
ret i32 %x
}
declare void @other()
+
+declare void @bar(ptr noundef) "aarch64_pstate_sm_compatible"
+
+define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK0-LABEL: sve_stack_object_and_vla:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-NEXT: stp x28, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: mov x29, sp
+; CHECK0-NEXT: addvl sp, sp, #-1
+; CHECK0-NEXT: mov x19, sp
+; CHECK0-NEXT: .cfi_def_cfa w29, 32
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w28, -16
+; CHECK0-NEXT: .cfi_offset w30, -24
+; CHECK0-NEXT: .cfi_offset w29, -32
+; CHECK0-NEXT: lsl x9, x0, #2
+; CHECK0-NEXT: mov x8, sp
+; CHECK0-NEXT: add x9, x9, #15
+; CHECK0-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK0-NEXT: sub x0, x8, x9
+; CHECK0-NEXT: mov sp, x0
+; CHECK0-NEXT: mov z0.s, #0 // =0x0
+; CHECK0-NEXT: ptrue p0.s
+; CHECK0-NEXT: st1w { z0.s }, p0, [x29, #-1, mul vl]
+; CHECK0-NEXT: bl bar
+; CHECK0-NEXT: mov w0, wzr
+; CHECK0-NEXT: mov sp, x29
+; CHECK0-NEXT: ldp x28, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: sve_stack_object_and_vla:
+; CHECK64: // %bb.0: // %entry
+; CHECK64-NEXT: sub sp, sp, #96
+; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: stp x28, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-1
+; CHECK64-NEXT: mov x19, sp
+; CHECK64-NEXT: .cfi_def_cfa w29, 32
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w28, -16
+; CHECK64-NEXT: .cfi_offset w30, -24
+; CHECK64-NEXT: .cfi_offset w29, -32
+; CHECK64-NEXT: lsl x9, x0, #2
+; CHECK64-NEXT: mov x8, sp
+; CHECK64-NEXT: add x9, x9, #15
+; CHECK64-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK64-NEXT: sub x0, x8, x9
+; CHECK64-NEXT: mov sp, x0
+; CHECK64-NEXT: mov z0.s, #0 // =0x0
+; CHECK64-NEXT: ptrue p0.s
+; CHECK64-NEXT: sub x8, x29, #64
+; CHECK64-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl]
+; CHECK64-NEXT: bl bar
+; CHECK64-NEXT: mov w0, wzr
+; CHECK64-NEXT: sub sp, x29, #64
+; CHECK64-NEXT: ldp x28, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #96
+; CHECK64-NEXT: ret
+;
+; CHECK1024-LABEL: sve_stack_object_and_vla:
+; CHECK1024: // %bb.0: // %entry
+; CHECK1024-NEXT: sub sp, sp, #1056
+; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT: add x29, sp, #1024
+; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT: str x19, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT: sub sp, sp, #1024
+; CHECK1024-NEXT: addvl sp, sp, #-1
+; CHECK1024-NEXT: mov x19, sp
+; CHECK1024-NEXT: .cfi_def_cfa w29, 32
+; CHECK1024-NEXT: .cfi_offset w19, -8
+; CHECK1024-NEXT: .cfi_offset w28, -16
+; CHECK1024-NEXT: .cfi_offset w30, -24
+; CHECK1024-NEXT: .cfi_offset w29, -32
+; CHECK1024-NEXT: lsl x9, x0, #2
+; CHECK1024-NEXT: mov x8, sp
+; CHECK1024-NEXT: add x9, x9, #15
+; CHECK1024-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK1024-NEXT: sub x0, x8, x9
+; CHECK1024-NEXT: mov sp, x0
+; CHECK1024-NEXT: mov z0.s, #0 // =0x0
+; CHECK1024-NEXT: ptrue p0.s
+; CHECK1024-NEXT: sub x8, x29, #1024
+; CHECK1024-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl]
+; CHECK1024-NEXT: bl bar
+; CHECK1024-NEXT: mov w0, wzr
+; CHECK1024-NEXT: sub sp, x29, #1024
+; CHECK1024-NEXT: ldr x19, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1056
+; CHECK1024-NEXT: ret
+entry:
+ %a = alloca <vscale x 4 x i32>
+ %b = alloca i32, i64 %sz, align 4
+ store <vscale x 4 x i32> zeroinitializer, ptr %a
+ call void @bar(ptr noundef nonnull %b)
+ ret i32 0
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/118091
More information about the llvm-commits
mailing list