[clang] [llvm] [AArch64] Stack probing for function prologues (PR #66524)
Momchil Velikov via cfe-commits
cfe-commits at lists.llvm.org
Thu Nov 23 06:24:18 PST 2023
================
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+
+; Test prolog sequences for stack probing when SVE objects are involved.
+
+; The space for SVE objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size) to allocate with a single `addvl`.
+; When we do know that the stack adjustment cannot exceed the probe size
+; we can avoid emitting a probe loop and emit a simple `addvl; str`
+; sequence instead.
+
+; Baseline case: a single SVE vector of stack space (one
+; `<vscale x 4 x float>` alloca). The expected output allocates it with one
+; `addvl sp, sp, #-1` and contains no probe store before the matching
+; `addvl sp, sp, #1` that releases it.
+; NOTE(review): presumably the allocation is small enough that no probe is
+; required -- confirm against attribute group #0, which is not visible here.
+define void @sve_1_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 4 SVE vectors of stack space.
+; The four allocas are covered by a single `addvl sp, sp, #-4`, and, as in
+; @sve_1_vector, the expected output contains no probe store.
+define void @sve_4_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_4_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 16 SVE vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096, so
+; we can allocate the locals at once.
+; The sixteen allocas are covered by a single `addvl sp, sp, #-16`, which the
+; expected output follows with one `str xzr, [sp]` probe rather than a loop.
+define void @sve_16_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_16_vector:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ %vec6 = alloca <vscale x 4 x float>, align 16
+ %vec7 = alloca <vscale x 4 x float>, align 16
+ %vec8 = alloca <vscale x 4 x float>, align 16
+ %vec9 = alloca <vscale x 4 x float>, align 16
+ %vec10 = alloca <vscale x 4 x float>, align 16
+ %vec11 = alloca <vscale x 4 x float>, align 16
+ %vec12 = alloca <vscale x 4 x float>, align 16
+ %vec13 = alloca <vscale x 4 x float>, align 16
+ %vec14 = alloca <vscale x 4 x float>, align 16
+ %vec15 = alloca <vscale x 4 x float>, align 16
+ %vec16 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 17 SVE vectors of stack space. Now we need
+; a probing loop since the stack adjustment may be greater than
+; the probe size (17 x 256 = 4352 bytes).
----------------
momchil-velikov wrote:
Done
https://github.com/llvm/llvm-project/pull/66524
More information about the cfe-commits
mailing list