[llvm] [AArch64][SME] Vastly simplify and fix `sme-framelower-use-bp.ll` (NFC) (PR #172999)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 19 04:31:06 PST 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/172999

From 5503872bd223ea888643fa1969b47b5d51a87a14 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 19 Dec 2025 12:17:58 +0000
Subject: [PATCH 1/2] [AArch64][SME] Vastly simplify and fix
 `sme-framelower-use-bp.ll` (NFC)

This test was added in: https://github.com/llvm/llvm-project/commit/d4c86e7f3ea298b259e673142470a7b838f5f302

However, over time this test stopped exercising that change. The
change ensures that LLVM sets up the base-pointer in functions built
with only +sme (no SVE) that contain dynamic allocas and SVE stack
objects.
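
Concretely, a function of the following shape needs the base pointer.
This is a sketch mirroring the simplified test in the diff below; the
function name is illustrative, and it assumes a RUN line with
-mattr=+sme:

  define void @needs_bp(i64 %n) "aarch64_new_za" "aarch64_pstate_sm_enabled" {
    ; A scalable-vector alloca gives the frame an SVE stack object...
    %zpr_local = alloca <vscale x 16 x i8>
    ; ...and a dynamic alloca means SP moves at runtime, so frame
    ; lowering must reserve x19 as a base pointer to address locals.
    %dyn_alloc = alloca i8, i64 %n, align 1
    ; Fake uses keep both objects alive through codegen.
    call void (...) @llvm.fake.use(ptr %dyn_alloc)
    call void (...) @llvm.fake.use(ptr %zpr_local)
    ret void
  }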

The original test was never intended to have dynamic allocas or SVE
stack objects, though. They crept in because the IR-based SME ABI
lowering unintentionally pushed allocas outside the entry block (which
makes them dynamic) and introduced SVE spills.
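
To illustrate the first point: an alloca that does not sit in the
entry block is lowered as a dynamic alloca, even with a constant size.
A minimal sketch (hypothetical function, not part of this patch):

  define void @non_entry_alloca() {
  entry:
    br label %later
  later:
    ; Not in the entry block, so this is treated as a dynamic alloca
    ; despite its constant size.
    %a = alloca i32
    call void (...) @llvm.fake.use(ptr %a)
    ret void
  }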

Both of these issues have since been resolved, so the test no longer
covered the original change. This patch simplifies the test and
corrects it so that it tests the intended functionality.
---
 .../CodeGen/AArch64/sme-framelower-use-bp.ll  | 770 +-----------------
 1 file changed, 28 insertions(+), 742 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 28050960c1da4..862fa89e54a9b 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -3,748 +3,34 @@
 
 target triple = "aarch64-linux-gnu"
 
-declare void @llvm.trap() #0
+; This test checks that with only +sme set with set up the base-pointer (x19) in
+; functions with dynamic allocas and SVE stack objects.
 
-; This test checks that we don't assert/crash due to not being able to reach the
-; emergency spill slot by ensuring that we use a BP for streaming functions.
-
-define void @quux() #1 {
+define void @quux(i64 %n) "aarch64_new_za" "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: quux:
-; CHECK:       // %bb.0: // %bb
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    sub sp, sp, #352
-; CHECK-NEXT:    addvl sp, sp, #-21
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x02, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x01, 0x1e, 0x22 // sp + 368 + 168 * VG
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    cbnz x8, .LBB0_1
-; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1: // %bb
-; CHECK-NEXT:    bl __arm_tpidr2_save
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    zero {za}
-; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_2: // %bb
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    strb w8, [sp, #207]
-; CHECK-NEXT:    strb w8, [sp, #183]
-; CHECK-NEXT:    strb w8, [sp, #143]
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    str x8, [sp, #8] // 8-byte Spill
-; CHECK-NEXT:    str x8, [sp, #128]
-; CHECK-NEXT:    ldr x9, [sp, #192]
-; CHECK-NEXT:    ldr x10, [sp, #224]
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    ldr x10, [sp, #208]
-; CHECK-NEXT:    subs x10, x10, #1
-; CHECK-NEXT:    ldr x11, [sp, #184]
-; CHECK-NEXT:    mul x10, x10, x11
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    str x9, [sp, #128]
-; CHECK-NEXT:    str x8, [sp, #120]
-; CHECK-NEXT:    ldr x9, [sp, #168]
-; CHECK-NEXT:    ldr x10, [sp, #216]
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    ldr x10, [sp, #208]
-; CHECK-NEXT:    subs x10, x10, #1
-; CHECK-NEXT:    ldr x11, [sp, #160]
-; CHECK-NEXT:    mul x10, x10, x11
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    str x9, [sp, #120]
-; CHECK-NEXT:    str x8, [sp, #112]
-; CHECK-NEXT:    ldr x9, [sp, #152]
-; CHECK-NEXT:    ldr x10, [sp, #224]
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    ldr x10, [sp, #216]
-; CHECK-NEXT:    subs x10, x10, #1
-; CHECK-NEXT:    ldr x11, [sp, #144]
-; CHECK-NEXT:    mul x10, x10, x11
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    str x9, [sp, #112]
-; CHECK-NEXT:    mov w9, #32 // =0x20
-; CHECK-NEXT:    // kill: def $x9 killed $w9
-; CHECK-NEXT:    str x9, [sp, #104]
-; CHECK-NEXT:    str x9, [sp, #96]
-; CHECK-NEXT:    str x8, [sp, #88]
-; CHECK-NEXT:    ldr x10, [sp, #208]
-; CHECK-NEXT:    lsl x10, x10, #5
-; CHECK-NEXT:    str x10, [sp, #88]
-; CHECK-NEXT:    str x9, [sp, #184]
-; CHECK-NEXT:    str x8, [sp, #80]
-; CHECK-NEXT:    ldr x10, [sp, #224]
-; CHECK-NEXT:    subs x10, x10, #1
-; CHECK-NEXT:    lsr x10, x10, #5
-; CHECK-NEXT:    add x10, x10, #1
-; CHECK-NEXT:    str x10, [sp, #80]
-; CHECK-NEXT:    ldr x10, [sp, #192]
-; CHECK-NEXT:    ldr x11, [sp, #80]
-; CHECK-NEXT:    ldr x12, [sp, #88]
-; CHECK-NEXT:    mul x11, x11, x12
-; CHECK-NEXT:    add x10, x10, x11, lsl #2
-; CHECK-NEXT:    str x10, [sp, #128]
-; CHECK-NEXT:    str x9, [sp, #160]
-; CHECK-NEXT:    str x8, [sp, #72]
-; CHECK-NEXT:    ldr x10, [sp, #216]
-; CHECK-NEXT:    subs x10, x10, #1
-; CHECK-NEXT:    lsr x10, x10, #5
-; CHECK-NEXT:    add x10, x10, #1
-; CHECK-NEXT:    str x10, [sp, #72]
-; CHECK-NEXT:    ldr x10, [sp, #168]
-; CHECK-NEXT:    ldr x11, [sp, #72]
-; CHECK-NEXT:    ldr x12, [sp, #88]
-; CHECK-NEXT:    mul x11, x11, x12
-; CHECK-NEXT:    add x10, x10, x11, lsl #2
-; CHECK-NEXT:    str x10, [sp, #120]
-; CHECK-NEXT:    str x8, [sp, #64]
-; CHECK-NEXT:    str x8, [sp, #64]
-; CHECK-NEXT:    str x8, [sp, #56]
-; CHECK-NEXT:    str x8, [sp, #56]
-; CHECK-NEXT:    str x8, [sp, #48]
-; CHECK-NEXT:    ldr x10, [sp, #224]
-; CHECK-NEXT:    ldr x11, [sp, #56]
-; CHECK-NEXT:    subs x10, x10, x11
-; CHECK-NEXT:    str x10, [sp, #256]
-; CHECK-NEXT:    str x9, [sp, #248]
-; CHECK-NEXT:    ldr x10, [sp, #256]
-; CHECK-NEXT:    str x10, [sp, #48]
-; CHECK-NEXT:    str x8, [sp, #40]
-; CHECK-NEXT:    ldr x10, [sp, #216]
-; CHECK-NEXT:    ldr x11, [sp, #64]
-; CHECK-NEXT:    subs x10, x10, x11
-; CHECK-NEXT:    str x10, [sp, #240]
-; CHECK-NEXT:    str x9, [sp, #232]
-; CHECK-NEXT:    ldr x9, [sp, #240]
-; CHECK-NEXT:    str x9, [sp, #40]
-; CHECK-NEXT:    ldr x9, [sp, #56]
-; CHECK-NEXT:    ldr x10, [sp, #224]
-; CHECK-NEXT:    whilelt pn8.s, x9, x10, vlx2
-; CHECK-NEXT:    add x9, sp, #352
-; CHECK-NEXT:    str pn8, [x9, #7, mul vl]
-; CHECK-NEXT:    ldr x9, [sp, #64]
-; CHECK-NEXT:    ldr x10, [sp, #216]
-; CHECK-NEXT:    whilelt pn8.s, x9, x10, vlx2
-; CHECK-NEXT:    add x9, sp, #352
-; CHECK-NEXT:    str pn8, [x9, #6, mul vl]
-; CHECK-NEXT:    str x8, [sp, #32]
-; CHECK-NEXT:    ldr x9, [sp, #152]
-; CHECK-NEXT:    ldr x10, [sp, #56]
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    ldr x10, [sp, #64]
-; CHECK-NEXT:    ldr x11, [sp, #144]
-; CHECK-NEXT:    mul x10, x10, x11
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    str x9, [sp, #32]
-; CHECK-NEXT:    zero {za}
-; CHECK-NEXT:    str x8, [sp, #24]
-; CHECK-NEXT:    ldr x9, [sp, #192]
-; CHECK-NEXT:    ldr x10, [sp, #56]
-; CHECK-NEXT:    ldr x11, [sp, #208]
-; CHECK-NEXT:    mul x10, x10, x11
-; CHECK-NEXT:    add x9, x9, x10, lsl #2
-; CHECK-NEXT:    str x9, [sp, #24]
-; CHECK-NEXT:    str x8, [sp, #16]
-; CHECK-NEXT:    ldr x8, [sp, #168]
-; CHECK-NEXT:    ldr x9, [sp, #64]
-; CHECK-NEXT:    ldr x10, [sp, #208]
-; CHECK-NEXT:    mul x9, x9, x10
-; CHECK-NEXT:    add x8, x8, x9, lsl #2
-; CHECK-NEXT:    str x8, [sp, #16]
-; CHECK-NEXT:    ldr x12, [sp, #208]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p1, [x8, #7, mul vl]
-; CHECK-NEXT:    ldr x11, [sp, #24]
-; CHECK-NEXT:    ldr x8, [sp, #184]
-; CHECK-NEXT:    lsr x10, x8, #2
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #6, mul vl]
-; CHECK-NEXT:    ldr x9, [sp, #16]
-; CHECK-NEXT:    ldr x8, [sp, #160]
-; CHECK-NEXT:    lsr x8, x8, #2
-; CHECK-NEXT:    str x12, [sp, #296]
-; CHECK-NEXT:    add x12, sp, #352
-; CHECK-NEXT:    str p1, [x12, #47, mul vl]
-; CHECK-NEXT:    str x11, [sp, #288]
-; CHECK-NEXT:    str x10, [sp, #280]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    str p0, [x10, #46, mul vl]
-; CHECK-NEXT:    str x9, [sp, #272]
-; CHECK-NEXT:    str x8, [sp, #264]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #47, mul vl]
-; CHECK-NEXT:    mov p8.b, p0.b
-; CHECK-NEXT:    pext { p3.s, p4.s }, pn8[0]
-; CHECK-NEXT:    mov p0.b, p3.b
-; CHECK-NEXT:    ptrue p2.s
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p2.b
-; CHECK-NEXT:    mov p1.b, p4.b
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p2.b
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    addpl x8, x8, #31
-; CHECK-NEXT:    addpl x8, x8, #13
-; CHECK-NEXT:    incd x8
-; CHECK-NEXT:    str p1, [x8]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p0, [x8, #44, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #46, mul vl]
-; CHECK-NEXT:    mov p8.b, p0.b
-; CHECK-NEXT:    pext { p3.s, p4.s }, pn8[0]
-; CHECK-NEXT:    mov p0.b, p3.b
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p2.b
-; CHECK-NEXT:    mov p1.b, p4.b
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p2.b
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    addpl x8, x8, #31
-; CHECK-NEXT:    addpl x8, x8, #11
-; CHECK-NEXT:    incd x8
-; CHECK-NEXT:    str p1, [x8]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p0, [x8, #42, mul vl]
-; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_3: // %bb178
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #47, mul vl]
-; CHECK-NEXT:    ldr x8, [sp, #288]
-; CHECK-NEXT:    mov p8.b, p0.b
-; CHECK-NEXT:    ld1w { z16.s, z24.s }, pn8/z, [x8]
-; CHECK-NEXT:    mov z0.d, z16.d
-; CHECK-NEXT:    mov z1.d, z24.d
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z1, [x8, #4, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z0, [x8, #3, mul vl]
-; CHECK-NEXT:    ldr x9, [sp, #280]
-; CHECK-NEXT:    ldr x8, [sp, #288]
-; CHECK-NEXT:    add x8, x8, x9, lsl #2
-; CHECK-NEXT:    str x8, [sp, #288]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #46, mul vl]
-; CHECK-NEXT:    ldr x8, [sp, #272]
-; CHECK-NEXT:    mov p8.b, p0.b
-; CHECK-NEXT:    ld1w { z16.s, z24.s }, pn8/z, [x8]
-; CHECK-NEXT:    mov z0.d, z16.d
-; CHECK-NEXT:    mov z1.d, z24.d
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z1, [x8, #2, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z0, [x8, #1, mul vl]
-; CHECK-NEXT:    ldr x9, [sp, #264]
-; CHECK-NEXT:    ldr x8, [sp, #272]
-; CHECK-NEXT:    add x8, x8, x9, lsl #2
-; CHECK-NEXT:    str x8, [sp, #272]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p1, [x8, #44, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #42, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z1, [x8, #3, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z0, [x8, #1, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p1, [x8, #95, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p0, [x8, #94, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z1, [x8, #10, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z0, [x8, #9, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #95, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p1, [x8, #94, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z0, [x8, #10, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z1, [x8, #9, mul vl]
-; CHECK-NEXT:    fmopa za0.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT:    add x9, sp, #352
-; CHECK-NEXT:    addpl x9, x9, #31
-; CHECK-NEXT:    addpl x9, x9, #13
-; CHECK-NEXT:    incd x9
-; CHECK-NEXT:    ldr p1, [x9]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #42, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z1, [x8, #4, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z0, [x8, #1, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p1, [x8, #119, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p0, [x8, #118, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z1, [x8, #13, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z0, [x8, #12, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #119, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p1, [x8, #118, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z0, [x8, #13, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z1, [x8, #12, mul vl]
-; CHECK-NEXT:    fmopa za1.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p1, [x8, #44, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    addpl x8, x8, #31
-; CHECK-NEXT:    addpl x8, x8, #11
-; CHECK-NEXT:    incd x8
-; CHECK-NEXT:    ldr p0, [x8]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ldr z1, [x10, #3, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ldr z0, [x10, #2, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    str p1, [x10, #143, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    str p0, [x10, #142, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    str z1, [x10, #16, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    str z0, [x10, #15, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ldr p0, [x10, #143, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ldr p1, [x10, #142, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ldr z0, [x10, #16, mul vl]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ldr z1, [x10, #15, mul vl]
-; CHECK-NEXT:    fmopa za2.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT:    ldr p1, [x9]
-; CHECK-NEXT:    ldr p0, [x8]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z1, [x8, #4, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z0, [x8, #2, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p1, [x8, #167, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str p0, [x8, #166, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z1, [x8, #19, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    str z0, [x8, #18, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p0, [x8, #167, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr p1, [x8, #166, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z0, [x8, #19, mul vl]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr z1, [x8, #18, mul vl]
-; CHECK-NEXT:    fmopa za3.s, p0/m, p1/m, z0.s, z1.s
-; CHECK-NEXT:    ldr x8, [sp, #296]
-; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    str x8, [sp, #296]
-; CHECK-NEXT:    b .LBB0_3
-bb:
-  %alloca = alloca <vscale x 16 x i1>, align 2
-  %alloca1 = alloca <vscale x 16 x i1>, align 2
-  %alloca2 = alloca <vscale x 4 x float>, align 16
-  %alloca3 = alloca <vscale x 4 x float>, align 16
-  %alloca4 = alloca <vscale x 16 x i1>, align 2
-  %alloca5 = alloca <vscale x 16 x i1>, align 2
-  %alloca6 = alloca <vscale x 4 x float>, align 16
-  %alloca7 = alloca <vscale x 4 x float>, align 16
-  %alloca8 = alloca <vscale x 16 x i1>, align 2
-  %alloca9 = alloca <vscale x 16 x i1>, align 2
-  %alloca10 = alloca <vscale x 4 x float>, align 16
-  %alloca11 = alloca <vscale x 4 x float>, align 16
-  %alloca12 = alloca <vscale x 16 x i1>, align 2
-  %alloca13 = alloca <vscale x 16 x i1>, align 2
-  %alloca14 = alloca <vscale x 4 x float>, align 16
-  %alloca15 = alloca <vscale x 4 x float>, align 16
-  %alloca16 = alloca i64, align 8
-  %alloca17 = alloca i64, align 8
-  %alloca18 = alloca ptr, align 8
-  %alloca19 = alloca i64, align 8
-  %alloca20 = alloca i64, align 8
-  %alloca21 = alloca target("aarch64.svcount"), align 2
-  %alloca22 = alloca i32, align 4
-  %alloca23 = alloca <vscale x 32 x i8>, align 16
-  %alloca24 = alloca i64, align 8
-  %alloca25 = alloca target("aarch64.svcount"), align 2
-  %alloca26 = alloca ptr, align 8
-  %alloca27 = alloca i64, align 8
-  %alloca28 = alloca target("aarch64.svcount"), align 2
-  %alloca29 = alloca ptr, align 8
-  %alloca30 = alloca i64, align 8
-  %alloca31 = alloca <vscale x 32 x i1>, align 2
-  %alloca32 = alloca <vscale x 32 x i1>, align 2
-  %alloca33 = alloca <vscale x 8 x float>, align 16
-  %alloca34 = alloca <vscale x 8 x float>, align 16
-  %alloca35 = alloca i64, align 8
-  %alloca36 = alloca i64, align 8
-  %alloca37 = alloca i64, align 8
-  %alloca38 = alloca i64, align 8
-  %alloca39 = alloca i64, align 8
-  %alloca40 = alloca i64, align 8
-  %alloca41 = alloca i64, align 8
-  %alloca42 = alloca i8, align 1
-  %alloca43 = alloca ptr, align 8
-  %alloca44 = alloca i64, align 8
-  %alloca45 = alloca i8, align 1
-  %alloca46 = alloca ptr, align 8
-  %alloca47 = alloca i64, align 8
-  %alloca48 = alloca ptr, align 8
-  %alloca49 = alloca i64, align 8
-  %alloca50 = alloca i8, align 1
-  %alloca51 = alloca ptr, align 8
-  %alloca52 = alloca ptr, align 8
-  %alloca53 = alloca ptr, align 8
-  %alloca54 = alloca i64, align 8
-  %alloca55 = alloca i64, align 8
-  %alloca56 = alloca i64, align 8
-  %alloca57 = alloca i64, align 8
-  %alloca58 = alloca i64, align 8
-  %alloca59 = alloca i64, align 8
-  %alloca60 = alloca i64, align 8
-  %alloca61 = alloca i64, align 8
-  %alloca62 = alloca i64, align 8
-  %alloca63 = alloca target("aarch64.svcount"), align 2
-  %alloca64 = alloca target("aarch64.svcount"), align 2
-  %alloca65 = alloca ptr, align 8
-  %alloca66 = alloca ptr, align 8
-  %alloca67 = alloca ptr, align 8
-  store i8 0, ptr %alloca42, align 1
-  store i8 0, ptr %alloca45, align 1
-  store i8 0, ptr %alloca50, align 1
-  store ptr null, ptr %alloca51, align 8
-  %load = load ptr, ptr %alloca43, align 8
-  %load68 = load i64, ptr %alloca39, align 8
-  %getelementptr = getelementptr inbounds float, ptr %load, i64 %load68
-  %load69 = load i64, ptr %alloca41, align 8
-  %sub = sub i64 %load69, 1
-  %load70 = load i64, ptr %alloca44, align 8
-  %mul = mul i64 %sub, %load70
-  %getelementptr71 = getelementptr inbounds float, ptr %getelementptr, i64 %mul
-  store ptr %getelementptr71, ptr %alloca51, align 8
-  store ptr null, ptr %alloca52, align 8
-  %load72 = load ptr, ptr %alloca46, align 8
-  %load73 = load i64, ptr %alloca40, align 8
-  %getelementptr74 = getelementptr inbounds float, ptr %load72, i64 %load73
-  %load75 = load i64, ptr %alloca41, align 8
-  %sub76 = sub i64 %load75, 1
-  %load77 = load i64, ptr %alloca47, align 8
-  %mul78 = mul i64 %sub76, %load77
-  %getelementptr79 = getelementptr inbounds float, ptr %getelementptr74, i64 %mul78
-  store ptr %getelementptr79, ptr %alloca52, align 8
-  store ptr null, ptr %alloca53, align 8
-  %load80 = load ptr, ptr %alloca48, align 8
-  %load81 = load i64, ptr %alloca39, align 8
-  %getelementptr82 = getelementptr inbounds float, ptr %load80, i64 %load81
-  %load83 = load i64, ptr %alloca40, align 8
-  %sub84 = sub i64 %load83, 1
-  %load85 = load i64, ptr %alloca49, align 8
-  %mul86 = mul i64 %sub84, %load85
-  %getelementptr87 = getelementptr inbounds float, ptr %getelementptr82, i64 %mul86
-  store ptr %getelementptr87, ptr %alloca53, align 8
-  store i64 32, ptr %alloca54, align 8
-  store i64 32, ptr %alloca55, align 8
-  store i64 0, ptr %alloca56, align 8
-  %load88 = load i64, ptr %alloca41, align 8
-  %mul89 = mul i64 32, %load88
-  store i64 %mul89, ptr %alloca56, align 8
-  %load90 = load i8, ptr %alloca42, align 1
-  %trunc = trunc i8 %load90 to i1
-  store i64 32, ptr %alloca44, align 8
-  store i64 0, ptr %alloca57, align 8
-  %load91 = load i64, ptr %alloca39, align 8
-  %sub92 = sub i64 %load91, 1
-  %udiv = udiv i64 %sub92, 32
-  %add = add i64 %udiv, 1
-  store i64 %add, ptr %alloca57, align 8
-  %load93 = load ptr, ptr %alloca43, align 8
-  %load94 = load i64, ptr %alloca57, align 8
-  %load95 = load i64, ptr %alloca56, align 8
-  %mul96 = mul i64 %load94, %load95
-  %getelementptr97 = getelementptr inbounds float, ptr %load93, i64 %mul96
-  store ptr %getelementptr97, ptr %alloca51, align 8
-  %load98 = load i8, ptr %alloca45, align 1
-  %trunc99 = trunc i8 %load98 to i1
-  store i64 32, ptr %alloca47, align 8
-  store i64 0, ptr %alloca58, align 8
-  %load100 = load i64, ptr %alloca40, align 8
-  %sub101 = sub i64 %load100, 1
-  %udiv102 = udiv i64 %sub101, 32
-  %add103 = add i64 %udiv102, 1
-  store i64 %add103, ptr %alloca58, align 8
-  %load104 = load ptr, ptr %alloca46, align 8
-  %load105 = load i64, ptr %alloca58, align 8
-  %load106 = load i64, ptr %alloca56, align 8
-  %mul107 = mul i64 %load105, %load106
-  %getelementptr108 = getelementptr inbounds float, ptr %load104, i64 %mul107
-  store ptr %getelementptr108, ptr %alloca52, align 8
-  store i64 0, ptr %alloca59, align 8
-  store i64 0, ptr %alloca59, align 8
-  %load109 = load i64, ptr %alloca59, align 8
-  %load110 = load i64, ptr %alloca40, align 8
-  %icmp = icmp ult i64 %load109, %load110
-  store i64 0, ptr %alloca60, align 8
-  store i64 0, ptr %alloca60, align 8
-  %load111 = load i64, ptr %alloca60, align 8
-  %load112 = load i64, ptr %alloca39, align 8
-  %icmp113 = icmp ult i64 %load111, %load112
-  store i64 0, ptr %alloca61, align 8
-  %load114 = load i64, ptr %alloca39, align 8
-  %load115 = load i64, ptr %alloca60, align 8
-  %sub116 = sub i64 %load114, %load115
-  store i64 %sub116, ptr %alloca35, align 8
-  store i64 32, ptr %alloca36, align 8
-  %load117 = load i64, ptr %alloca35, align 8
-  %load118 = load i64, ptr %alloca36, align 8
-  %icmp119 = icmp ult i64 %load117, %load118
-  %load120 = load i64, ptr %alloca35, align 8
-  store i64 %load120, ptr %alloca61, align 8
-  store i64 0, ptr %alloca62, align 8
-  %load121 = load i64, ptr %alloca40, align 8
-  %load122 = load i64, ptr %alloca59, align 8
-  %sub123 = sub i64 %load121, %load122
-  store i64 %sub123, ptr %alloca37, align 8
-  store i64 32, ptr %alloca38, align 8
-  %load124 = load i64, ptr %alloca37, align 8
-  %load125 = load i64, ptr %alloca38, align 8
-  %icmp126 = icmp ult i64 %load124, %load125
-  %load127 = load i64, ptr %alloca37, align 8
-  store i64 %load127, ptr %alloca62, align 8
-  %load128 = load i64, ptr %alloca60, align 8
-  %load129 = load i64, ptr %alloca39, align 8
-  %call = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 %load128, i64 %load129, i32 2)
-  store target("aarch64.svcount") %call, ptr %alloca63, align 2
-  %load130 = load i64, ptr %alloca59, align 8
-  %load131 = load i64, ptr %alloca40, align 8
-  %call132 = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 %load130, i64 %load131, i32 2)
-  store target("aarch64.svcount") %call132, ptr %alloca64, align 2
-  store ptr null, ptr %alloca65, align 8
-  %load133 = load ptr, ptr %alloca48, align 8
-  %load134 = load i64, ptr %alloca60, align 8
-  %getelementptr135 = getelementptr inbounds float, ptr %load133, i64 %load134
-  %load136 = load i64, ptr %alloca59, align 8
-  %load137 = load i64, ptr %alloca49, align 8
-  %mul138 = mul i64 %load136, %load137
-  %getelementptr139 = getelementptr inbounds float, ptr %getelementptr135, i64 %mul138
-  store ptr %getelementptr139, ptr %alloca65, align 8
-  call void @llvm.aarch64.sme.zero(i32 255)
-  store ptr null, ptr %alloca66, align 8
-  %load140 = load i8, ptr %alloca42, align 1
-  %trunc141 = trunc i8 %load140 to i1
-  %load142 = load ptr, ptr %alloca43, align 8
-  %load143 = load i64, ptr %alloca60, align 8
-  %load144 = load i64, ptr %alloca41, align 8
-  %mul145 = mul i64 %load143, %load144
-  %getelementptr146 = getelementptr inbounds float, ptr %load142, i64 %mul145
-  store ptr %getelementptr146, ptr %alloca66, align 8
-  store ptr null, ptr %alloca67, align 8
-  %load147 = load i8, ptr %alloca45, align 1
-  %trunc148 = trunc i8 %load147 to i1
-  %load149 = load ptr, ptr %alloca46, align 8
-  %load150 = load i64, ptr %alloca59, align 8
-  %load151 = load i64, ptr %alloca41, align 8
-  %mul152 = mul i64 %load150, %load151
-  %getelementptr153 = getelementptr inbounds float, ptr %load149, i64 %mul152
-  store ptr %getelementptr153, ptr %alloca67, align 8
-  %load154 = load i64, ptr %alloca41, align 8
-  %load155 = load target("aarch64.svcount"), ptr %alloca63, align 2
-  %load156 = load ptr, ptr %alloca66, align 8
-  %load157 = load i64, ptr %alloca44, align 8
-  %udiv158 = udiv i64 %load157, 4
-  %load159 = load target("aarch64.svcount"), ptr %alloca64, align 2
-  %load160 = load ptr, ptr %alloca67, align 8
-  %load161 = load i64, ptr %alloca47, align 8
-  %udiv162 = udiv i64 %load161, 4
-  store i64 %load154, ptr %alloca24, align 8
-  store target("aarch64.svcount") %load155, ptr %alloca25, align 2
-  store ptr %load156, ptr %alloca26, align 8
-  store i64 %udiv158, ptr %alloca27, align 8
-  store target("aarch64.svcount") %load159, ptr %alloca28, align 2
-  store ptr %load160, ptr %alloca29, align 8
-  store i64 %udiv162, ptr %alloca30, align 8
-  %load163 = load target("aarch64.svcount"), ptr %alloca25, align 2
-  %call164 = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") %load163, i32 0)
-  %extractvalue = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call164, 0
-  %call165 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue)
-  %call166 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %call165, i64 0)
-  %extractvalue167 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call164, 1
-  %call168 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue167)
-  %call169 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %call166, <vscale x 16 x i1> %call168, i64 16)
-  store <vscale x 32 x i1> %call169, ptr %alloca31, align 2
-  %load170 = load target("aarch64.svcount"), ptr %alloca28, align 2
-  %call171 = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") %load170, i32 0)
-  %extractvalue172 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call171, 0
-  %call173 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue172)
-  %call174 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %call173, i64 0)
-  %extractvalue175 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call171, 1
-  %call176 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue175)
-  %call177 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %call174, <vscale x 16 x i1> %call176, i64 16)
-  store <vscale x 32 x i1> %call177, ptr %alloca32, align 2
-  br label %bb178
-
-bb178:                                            ; preds = %bb178, %bb
-  %load179 = load i64, ptr %alloca24, align 8
-  %icmp180 = icmp ugt i64 %load179, 0
-  %load181 = load target("aarch64.svcount"), ptr %alloca25, align 2
-  %load182 = load ptr, ptr %alloca26, align 8
-  %call183 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %load181, ptr %load182)
-  %extractvalue184 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call183, 0
-  %call185 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> %extractvalue184, i64 0)
-  %extractvalue186 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call183, 1
-  %call187 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> %call185, <vscale x 4 x float> %extractvalue186, i64 4)
-  store <vscale x 8 x float> %call187, ptr %alloca33, align 16
-  %load188 = load i64, ptr %alloca27, align 8
-  %load189 = load ptr, ptr %alloca26, align 8
-  %getelementptr190 = getelementptr inbounds float, ptr %load189, i64 %load188
-  store ptr %getelementptr190, ptr %alloca26, align 8
-  %load191 = load target("aarch64.svcount"), ptr %alloca28, align 2
-  %load192 = load ptr, ptr %alloca29, align 8
-  %call193 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %load191, ptr %load192)
-  %extractvalue194 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call193, 0
-  %call195 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> %extractvalue194, i64 0)
-  %extractvalue196 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call193, 1
-  %call197 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> %call195, <vscale x 4 x float> %extractvalue196, i64 4)
-  store <vscale x 8 x float> %call197, ptr %alloca34, align 16
-  %load198 = load i64, ptr %alloca30, align 8
-  %load199 = load ptr, ptr %alloca29, align 8
-  %getelementptr200 = getelementptr inbounds float, ptr %load199, i64 %load198
-  store ptr %getelementptr200, ptr %alloca29, align 8
-  %load201 = load <vscale x 32 x i1>, ptr %alloca31, align 2
-  %call202 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load201, i64 0)
-  %load203 = load <vscale x 32 x i1>, ptr %alloca32, align 2
-  %call204 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load203, i64 0)
-  %load205 = load <vscale x 8 x float>, ptr %alloca33, align 16
-  %call206 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load205, i64 0)
-  %load207 = load <vscale x 8 x float>, ptr %alloca34, align 16
-  %call208 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load207, i64 0)
-  store <vscale x 16 x i1> %call202, ptr %alloca12, align 2
-  store <vscale x 16 x i1> %call204, ptr %alloca13, align 2
-  store <vscale x 4 x float> %call206, ptr %alloca14, align 16
-  store <vscale x 4 x float> %call208, ptr %alloca15, align 16
-  %load209 = load <vscale x 16 x i1>, ptr %alloca12, align 2
-  %load210 = load <vscale x 16 x i1>, ptr %alloca13, align 2
-  %load211 = load <vscale x 4 x float>, ptr %alloca14, align 16
-  %load212 = load <vscale x 4 x float>, ptr %alloca15, align 16
-  %call213 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load209)
-  %call214 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load210)
-  call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %call213, <vscale x 4 x i1> %call214, <vscale x 4 x float> %load211, <vscale x 4 x float> %load212)
-  %load215 = load <vscale x 32 x i1>, ptr %alloca31, align 2
-  %call216 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load215, i64 16)
-  %load217 = load <vscale x 32 x i1>, ptr %alloca32, align 2
-  %call218 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load217, i64 0)
-  %load219 = load <vscale x 8 x float>, ptr %alloca33, align 16
-  %call220 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load219, i64 4)
-  %load221 = load <vscale x 8 x float>, ptr %alloca34, align 16
-  %call222 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load221, i64 0)
-  store <vscale x 16 x i1> %call216, ptr %alloca8, align 2
-  store <vscale x 16 x i1> %call218, ptr %alloca9, align 2
-  store <vscale x 4 x float> %call220, ptr %alloca10, align 16
-  store <vscale x 4 x float> %call222, ptr %alloca11, align 16
-  %load223 = load <vscale x 16 x i1>, ptr %alloca8, align 2
-  %load224 = load <vscale x 16 x i1>, ptr %alloca9, align 2
-  %load225 = load <vscale x 4 x float>, ptr %alloca10, align 16
-  %load226 = load <vscale x 4 x float>, ptr %alloca11, align 16
-  %call227 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load223)
-  %call228 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load224)
-  call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, <vscale x 4 x i1> %call227, <vscale x 4 x i1> %call228, <vscale x 4 x float> %load225, <vscale x 4 x float> %load226)
-  %load229 = load <vscale x 32 x i1>, ptr %alloca31, align 2
-  %call230 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load229, i64 0)
-  %load231 = load <vscale x 32 x i1>, ptr %alloca32, align 2
-  %call232 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load231, i64 16)
-  %load233 = load <vscale x 8 x float>, ptr %alloca33, align 16
-  %call234 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load233, i64 0)
-  %load235 = load <vscale x 8 x float>, ptr %alloca34, align 16
-  %call236 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load235, i64 4)
-  store <vscale x 16 x i1> %call230, ptr %alloca4, align 2
-  store <vscale x 16 x i1> %call232, ptr %alloca5, align 2
-  store <vscale x 4 x float> %call234, ptr %alloca6, align 16
-  store <vscale x 4 x float> %call236, ptr %alloca7, align 16
-  %load237 = load <vscale x 16 x i1>, ptr %alloca4, align 2
-  %load238 = load <vscale x 16 x i1>, ptr %alloca5, align 2
-  %load239 = load <vscale x 4 x float>, ptr %alloca6, align 16
-  %load240 = load <vscale x 4 x float>, ptr %alloca7, align 16
-  %call241 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load237)
-  %call242 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load238)
-  call void @llvm.aarch64.sme.mopa.nxv4f32(i32 2, <vscale x 4 x i1> %call241, <vscale x 4 x i1> %call242, <vscale x 4 x float> %load239, <vscale x 4 x float> %load240)
-  %load243 = load <vscale x 32 x i1>, ptr %alloca31, align 2
-  %call244 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load243, i64 16)
-  %load245 = load <vscale x 32 x i1>, ptr %alloca32, align 2
-  %call246 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load245, i64 16)
-  %load247 = load <vscale x 8 x float>, ptr %alloca33, align 16
-  %call248 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load247, i64 4)
-  %load249 = load <vscale x 8 x float>, ptr %alloca34, align 16
-  %call250 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load249, i64 4)
-  store <vscale x 16 x i1> %call244, ptr %alloca, align 2
-  store <vscale x 16 x i1> %call246, ptr %alloca1, align 2
-  store <vscale x 4 x float> %call248, ptr %alloca2, align 16
-  store <vscale x 4 x float> %call250, ptr %alloca3, align 16
-  %load251 = load <vscale x 16 x i1>, ptr %alloca, align 2
-  %load252 = load <vscale x 16 x i1>, ptr %alloca1, align 2
-  %load253 = load <vscale x 4 x float>, ptr %alloca2, align 16
-  %load254 = load <vscale x 4 x float>, ptr %alloca3, align 16
-  %call255 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load251)
-  %call256 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load252)
-  call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, <vscale x 4 x i1> %call255, <vscale x 4 x i1> %call256, <vscale x 4 x float> %load253, <vscale x 4 x float> %load254)
-  %load257 = load i64, ptr %alloca24, align 8
-  %add258 = add i64 %load257, -1
-  store i64 %add258, ptr %alloca24, align 8
-  br label %bb178
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    add x8, x0, #15
+; CHECK-NEXT:    and x9, x8, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    subs x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    // fake_use: $x8
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %zpr_local = alloca <vscale x 16 x i8>
+  %dyn_alloc = alloca i8, i64 %n, align 1
+  call void (...) @llvm.fake.use(ptr %dyn_alloc)
+  call void (...) @llvm.fake.use(ptr %zpr_local)
+  ret void
 }
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64, i64, i32 immarg) #2
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare void @llvm.aarch64.sme.zero(i32 immarg) #3
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount"), i32 immarg) #2
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>) #2
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1>, <vscale x 16 x i1>, i64 immarg) #4
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr) #5
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float>, <vscale x 4 x float>, i64 immarg) #4
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1>, i64 immarg) #4
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float>, i64 immarg) #4
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>) #2
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare void @llvm.aarch64.sme.mopa.nxv4f32(i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>) #3
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64, i64, i32 immarg) #2
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32) #3
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8>, <vscale x 16 x i8>, i64 immarg) #4
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8>, i64 immarg) #4
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
-declare void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr) #6
-
-attributes #0 = { cold noreturn nounwind }
-attributes #1 = { mustprogress noinline optnone ssp uwtable(sync) vscale_range(1,16) "aarch64_new_za" "aarch64_pstate_sm_enabled" "frame-pointer"="non-leaf" "target-features"="+fp-armv8,+fullfp16,+sme,+sme-f64f64,+sme2" }
-attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #3 = { nocallback nofree nosync nounwind willreturn }
-attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
-attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }

From 8243098ac28570ee8f40072c26bbfd7105559d86 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 19 Dec 2025 12:30:04 +0000
Subject: [PATCH 2/2] Fix typo

---
 llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 862fa89e54a9b..301eee43d33e6 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -3,8 +3,8 @@
 
 target triple = "aarch64-linux-gnu"
 
-; This test checks that with only +sme set with set up the base-pointer (x19) in
-; functions with dynamic allocas and SVE stack objects.
+; This test checks that with only +sme (no SVE), LLVM sets up the base-pointer
+; (x19) in functions with dynamic allocas and SVE stack objects.
 
 define void @quux(i64 %n) "aarch64_new_za" "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: quux:


