[llvm] [AArch64][SME] Fix restoring callee-saves from FP with hazard padding (PR #143371)
Eli Friedman via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 9 13:17:21 PDT 2025
================
@@ -3143,3 +3143,428 @@ entry:
call void @bar(ptr noundef nonnull %b)
ret i32 0
}
+
+
+define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call_dynamic_alloca:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_def_cfa_offset 64
+; CHECK0-NEXT: cntd x9
+; CHECK0-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT: mov x29, sp
+; CHECK0-NEXT: .cfi_def_cfa w29, 64
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w20, -16
+; CHECK0-NEXT: .cfi_offset w26, -24
+; CHECK0-NEXT: .cfi_offset w27, -32
+; CHECK0-NEXT: .cfi_offset w28, -40
+; CHECK0-NEXT: .cfi_offset w30, -56
+; CHECK0-NEXT: .cfi_offset w29, -64
+; CHECK0-NEXT: addvl sp, sp, #-18
+; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG
+; CHECK0-NEXT: mov w9, w0
+; CHECK0-NEXT: mov x8, sp
+; CHECK0-NEXT: mov w2, w1
+; CHECK0-NEXT: add x9, x9, #15
+; CHECK0-NEXT: mov x19, sp
+; CHECK0-NEXT: and x9, x9, #0x1fffffff0
+; CHECK0-NEXT: sub x8, x8, x9
+; CHECK0-NEXT: mov sp, x8
+; CHECK0-NEXT: //APP
+; CHECK0-NEXT: //NO_APP
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: and x20, x0, #0x1
+; CHECK0-NEXT: .cfi_offset vg, -48
+; CHECK0-NEXT: tbz w20, #0, .LBB35_2
+; CHECK0-NEXT: // %bb.1: // %entry
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB35_2: // %entry
+; CHECK0-NEXT: mov x0, x8
+; CHECK0-NEXT: mov w1, #45 // =0x2d
+; CHECK0-NEXT: bl memset
+; CHECK0-NEXT: tbz w20, #0, .LBB35_4
+; CHECK0-NEXT: // %bb.3: // %entry
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB35_4: // %entry
+; CHECK0-NEXT: mov w0, #22647 // =0x5877
+; CHECK0-NEXT: movk w0, #59491, lsl #16
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: addvl sp, x29, #-18
+; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: .cfi_restore z8
+; CHECK0-NEXT: .cfi_restore z9
+; CHECK0-NEXT: .cfi_restore z10
+; CHECK0-NEXT: .cfi_restore z11
+; CHECK0-NEXT: .cfi_restore z12
+; CHECK0-NEXT: .cfi_restore z13
+; CHECK0-NEXT: .cfi_restore z14
+; CHECK0-NEXT: .cfi_restore z15
+; CHECK0-NEXT: mov sp, x29
+; CHECK0-NEXT: .cfi_def_cfa wsp, 64
+; CHECK0-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK0-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK0-NEXT: .cfi_def_cfa_offset 0
+; CHECK0-NEXT: .cfi_restore w19
+; CHECK0-NEXT: .cfi_restore w20
+; CHECK0-NEXT: .cfi_restore w26
+; CHECK0-NEXT: .cfi_restore w27
+; CHECK0-NEXT: .cfi_restore w28
+; CHECK0-NEXT: .cfi_restore w30
+; CHECK0-NEXT: .cfi_restore w29
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: svecc_call_dynamic_alloca:
+; CHECK64: // %bb.0: // %entry
+; CHECK64-NEXT: sub sp, sp, #128
+; CHECK64-NEXT: .cfi_def_cfa_offset 128
+; CHECK64-NEXT: cntd x9
+; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: .cfi_def_cfa w29, 64
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w20, -16
+; CHECK64-NEXT: .cfi_offset w26, -24
+; CHECK64-NEXT: .cfi_offset w27, -32
+; CHECK64-NEXT: .cfi_offset w28, -40
+; CHECK64-NEXT: .cfi_offset w30, -56
+; CHECK64-NEXT: .cfi_offset w29, -64
+; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: mov w9, w0
+; CHECK64-NEXT: mov x8, sp
+; CHECK64-NEXT: mov w2, w1
+; CHECK64-NEXT: add x9, x9, #15
+; CHECK64-NEXT: mov x19, sp
+; CHECK64-NEXT: and x9, x9, #0x1fffffff0
+; CHECK64-NEXT: sub x8, x8, x9
+; CHECK64-NEXT: mov sp, x8
+; CHECK64-NEXT: //APP
+; CHECK64-NEXT: //NO_APP
+; CHECK64-NEXT: bl __arm_sme_state
+; CHECK64-NEXT: and x20, x0, #0x1
+; CHECK64-NEXT: .cfi_offset vg, -48
+; CHECK64-NEXT: tbz w20, #0, .LBB35_2
+; CHECK64-NEXT: // %bb.1: // %entry
+; CHECK64-NEXT: smstop sm
+; CHECK64-NEXT: .LBB35_2: // %entry
+; CHECK64-NEXT: mov x0, x8
+; CHECK64-NEXT: mov w1, #45 // =0x2d
+; CHECK64-NEXT: bl memset
+; CHECK64-NEXT: tbz w20, #0, .LBB35_4
+; CHECK64-NEXT: // %bb.3: // %entry
+; CHECK64-NEXT: smstart sm
+; CHECK64-NEXT: .LBB35_4: // %entry
+; CHECK64-NEXT: mov w0, #22647 // =0x5877
+; CHECK64-NEXT: movk w0, #59491, lsl #16
+; CHECK64-NEXT: .cfi_restore vg
+; CHECK64-NEXT: sub sp, x29, #64
+; CHECK64-NEXT: addvl sp, sp, #-18
----------------
efriedma-quic wrote:
Modifying SP this way introduces a small window for memory corruption: between the sub and the addvl, SP briefly sits above the SVE callee-save area, and memory below SP can be clobbered asynchronously (for example by a signal handler or an interrupt taken in that window), so the not-yet-reloaded callee-saves can get overwritten. I think you need a temporary register here.
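
As an illustrative sketch only (not part of the patch, and assuming a scratch register such as x9 is actually free at this point in the epilogue), the idea is to compute the final address off the frame pointer in the scratch register and then move it into SP in a single step, so SP never ends up above the still-unread saves:

  // Hypothetical epilogue sequence; x9 is an assumed-free scratch register.
  sub   x9, x29, #64    // x9 = FP - 64 (skip the hazard padding below the frame record)
  addvl x9, x9, #-18    // x9 = base of the 18-vector SVE callee-save area
  mov   sp, x9          // SP jumps directly to the final value; the callee-saves
                        // stay at or above SP until they are reloaded

With this shape, SP only ever moves from below the save area straight to its base, so there is no point at which the SVE callee-saves sit below SP.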
https://github.com/llvm/llvm-project/pull/143371