[llvm] [AArch64][SME] Fix frame lowering not using a base pointer for SME functions. (PR #91643)
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Fri May 10 10:46:55 PDT 2024
https://github.com/aemerson updated https://github.com/llvm/llvm-project/pull/91643
From 089b89d35bea628eae88c1478a32e4765fce1e2a Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Fri, 10 May 2024 10:45:03 -0700
Subject: [PATCH 1/2] Precommit test
---
.../CodeGen/AArch64/sme-framelower-use-bp.ll | 1077 +++++++++++++++++
1 file changed, 1077 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
new file mode 100644
index 0000000000000..7050f80f46e0b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -0,0 +1,1077 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+declare void @llvm.trap() #0
+
+define void @quux() #1 {
+; CHECK-LABEL: quux:
+; CHECK: // %bb.0: // %prelude
+; CHECK-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #384
+; CHECK-NEXT: .cfi_def_cfa w29, 96
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w23, -40
+; CHECK-NEXT: .cfi_offset w24, -48
+; CHECK-NEXT: .cfi_offset w25, -56
+; CHECK-NEXT: .cfi_offset w26, -64
+; CHECK-NEXT: .cfi_offset w27, -72
+; CHECK-NEXT: .cfi_offset w28, -80
+; CHECK-NEXT: .cfi_offset w30, -88
+; CHECK-NEXT: .cfi_offset w29, -96
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-32]
+; CHECK-NEXT: sturh wzr, [x29, #-22]
+; CHECK-NEXT: stur wzr, [x29, #-20]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB0_2
+; CHECK-NEXT: b .LBB0_1
+; CHECK-NEXT: .LBB0_1: // %save.za
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: b .LBB0_2
+; CHECK-NEXT: .LBB0_2: // %bb
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: mov w23, #15 // =0xf
+; CHECK-NEXT: mov w8, #15 // =0xf
+; CHECK-NEXT: incd x8
+; CHECK-NEXT: // kill: def $w8 killed $w8 killed $x8 def $x8
+; CHECK-NEXT: and x28, x8, #0x70
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: subs x8, x8, x28
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: subs x9, x9, x28
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: addvl x10, x23, #1
+; CHECK-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
+; CHECK-NEXT: and x4, x10, #0x3f0
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: subs x10, x10, x4
+; CHECK-NEXT: mov sp, x10
+; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: subs x11, x11, x4
+; CHECK-NEXT: mov sp, x11
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: subs x12, x12, x28
+; CHECK-NEXT: mov sp, x12
+; CHECK-NEXT: mov x13, sp
+; CHECK-NEXT: subs x13, x13, x28
+; CHECK-NEXT: mov sp, x13
+; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: subs x14, x14, x4
+; CHECK-NEXT: mov sp, x14
+; CHECK-NEXT: mov x15, sp
+; CHECK-NEXT: subs x15, x15, x4
+; CHECK-NEXT: mov sp, x15
+; CHECK-NEXT: mov x16, sp
+; CHECK-NEXT: subs x16, x16, x28
+; CHECK-NEXT: mov sp, x16
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: subs x17, x17, x28
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: mov x18, sp
+; CHECK-NEXT: subs x18, x18, x4
+; CHECK-NEXT: mov sp, x18
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: subs x0, x0, x4
+; CHECK-NEXT: mov sp, x0
+; CHECK-NEXT: mov x1, sp
+; CHECK-NEXT: subs x1, x1, x28
+; CHECK-NEXT: mov sp, x1
+; CHECK-NEXT: mov x2, sp
+; CHECK-NEXT: subs x2, x2, x28
+; CHECK-NEXT: mov sp, x2
+; CHECK-NEXT: mov x3, sp
+; CHECK-NEXT: subs x3, x3, x4
+; CHECK-NEXT: mov sp, x3
+; CHECK-NEXT: mov x5, sp
+; CHECK-NEXT: subs x4, x5, x4
+; CHECK-NEXT: mov sp, x4
+; CHECK-NEXT: subs x5, sp, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: subs x5, sp, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: subs x5, sp, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: subs x5, sp, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: subs x5, sp, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: mov x5, sp
+; CHECK-NEXT: subs x5, x5, x28
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: subs x5, sp, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: addvl x5, x23, #2
+; CHECK-NEXT: // kill: def $w5 killed $w5 killed $x5 def $x5
+; CHECK-NEXT: and x26, x5, #0x7f0
+; CHECK-NEXT: mov x5, sp
+; CHECK-NEXT: subs x5, x5, x26
+; CHECK-NEXT: and sp, x5, #0xffffffffffffffe0
+; CHECK-NEXT: mov x5, sp
+; CHECK-NEXT: stur x5, [x29, #-40] // 8-byte Folded Spill
+; CHECK-NEXT: subs x5, x5, #16
+; CHECK-NEXT: mov sp, x5
+; CHECK-NEXT: mov x6, sp
+; CHECK-NEXT: subs x6, x6, x28
+; CHECK-NEXT: mov sp, x6
+; CHECK-NEXT: mov x7, sp
+; CHECK-NEXT: stur x7, [x29, #-48] // 8-byte Folded Spill
+; CHECK-NEXT: subs x7, x7, #16
+; CHECK-NEXT: mov sp, x7
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: stur x19, [x29, #-56] // 8-byte Folded Spill
+; CHECK-NEXT: subs x19, x19, #16
+; CHECK-NEXT: mov sp, x19
+; CHECK-NEXT: mov x20, sp
+; CHECK-NEXT: subs x20, x20, x28
+; CHECK-NEXT: mov sp, x20
+; CHECK-NEXT: mov x21, sp
+; CHECK-NEXT: stur x21, [x29, #-64] // 8-byte Folded Spill
+; CHECK-NEXT: subs x21, x21, #16
+; CHECK-NEXT: mov sp, x21
+; CHECK-NEXT: mov x22, sp
+; CHECK-NEXT: stur x22, [x29, #-72] // 8-byte Folded Spill
+; CHECK-NEXT: subs x22, x22, #16
+; CHECK-NEXT: mov sp, x22
+; CHECK-NEXT: incw x23
+; CHECK-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23
+; CHECK-NEXT: and x24, x23, #0xf0
+; CHECK-NEXT: mov x23, sp
+; CHECK-NEXT: subs x23, x23, x24
+; CHECK-NEXT: mov sp, x23
+; CHECK-NEXT: mov x25, sp
+; CHECK-NEXT: subs x24, x25, x24
+; CHECK-NEXT: mov sp, x24
+; CHECK-NEXT: mov x25, sp
+; CHECK-NEXT: subs x25, x25, x26
+; CHECK-NEXT: and x25, x25, #0xffffffffffffffe0
+; CHECK-NEXT: mov sp, x25
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: subs x26, x27, x26
+; CHECK-NEXT: and x26, x26, #0xffffffffffffffe0
+; CHECK-NEXT: mov sp, x26
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-96] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-112] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-104] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-120] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-160] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-152] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-80] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-224] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-128] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-88] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-248] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-144] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-136] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-168] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-176] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #80
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-208] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-184] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #32
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #40
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #56
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #64
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #72
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #48
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #8
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #16
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: sub x30, x29, #24
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: mov x30, sp
+; CHECK-NEXT: subs x27, x30, x28
+; CHECK-NEXT: stur x27, [x29, #-192] // 8-byte Folded Spill
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x30, sp
+; CHECK-NEXT: subs x27, x30, x28
+; CHECK-NEXT: stur x27, [x29, #-200] // 8-byte Folded Spill
+; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-240] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-216] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x27, [x29, #-232] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: ldur x27, [x29, #-224] // 8-byte Folded Reload
+; CHECK-NEXT: sturb wzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-248] // 8-byte Folded Reload
+; CHECK-NEXT: sturb wzr, [x27, #-16]
+; CHECK-NEXT: sub x27, x29, #80
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: sturb wzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-208] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-128] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x27, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x27, #-16]
+; CHECK-NEXT: subs x30, x30, #1
+; CHECK-NEXT: ldur x27, [x29, #-88] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: mul x27, x30, x27
+; CHECK-NEXT: add x27, x28, x27, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-208] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-184] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-144] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: subs x28, x28, #1
+; CHECK-NEXT: ldur x30, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: mul x28, x28, x30
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-184] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: sub x27, x29, #32
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-168] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: subs x28, x28, #1
+; CHECK-NEXT: ldur x30, [x29, #-176] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: mul x28, x28, x30
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: sub x28, x29, #32
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: mov w28, #32 // =0x20
+; CHECK-NEXT: sub x27, x29, #40
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x27, #-16]
+; CHECK-NEXT: sub x27, x29, #56
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x27, #-16]
+; CHECK-NEXT: sub x27, x29, #64
+; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x30, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: lsl x28, x28, #5
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-88] // 8-byte Folded Reload
+; CHECK-NEXT: mov w28, #32 // =0x20
+; CHECK-NEXT: stur x28, [x27, #-16]
+; CHECK-NEXT: sub x27, x29, #72
+; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x30, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: subs x28, x28, #1
+; CHECK-NEXT: lsr x28, x28, #5
+; CHECK-NEXT: add x28, x28, #1
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-128] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x28, x29, #120
+; CHECK-NEXT: stur x27, [x28, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x30, #-16]
+; CHECK-NEXT: sub x27, x29, #64
+; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x30, #-16]
+; CHECK-NEXT: mul x27, x28, x27
+; CHECK-NEXT: sub x28, x29, #120
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x27, x28, x27, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-208] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: mov w28, #32 // =0x20
+; CHECK-NEXT: stur x28, [x27, #-16]
+; CHECK-NEXT: sub x27, x29, #48
+; CHECK-NEXT: ldur x28, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: subs x27, x27, #1
+; CHECK-NEXT: lsr x27, x27, #5
+; CHECK-NEXT: add x27, x27, #1
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-144] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: mul x28, x28, x30
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-184] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: sub x27, x29, #8
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: sub x27, x29, #16
+; CHECK-NEXT: ldur x28, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: subs x27, x27, x28
+; CHECK-NEXT: ldur x28, [x29, #-96] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-112] // 8-byte Folded Reload
+; CHECK-NEXT: mov w30, #32 // =0x20
+; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: sub x27, x29, #24
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x28, x29, #8
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: subs x27, x27, x28
+; CHECK-NEXT: ldur x28, [x29, #-104] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-120] // 8-byte Folded Reload
+; CHECK-NEXT: mov w30, #32 // =0x20
+; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x28, #-16]
+; CHECK-NEXT: sub x28, x29, #24
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: sub x27, x29, #16
+; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: whilelt pn8.s, x27, x28, vlx2
+; CHECK-NEXT: ldur x27, [x29, #-192] // 8-byte Folded Reload
+; CHECK-NEXT: str pn8, [x27]
+; CHECK-NEXT: sub x27, x29, #8
+; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x30, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: whilelt pn8.s, x27, x28, vlx2
+; CHECK-NEXT: ldur x27, [x29, #-200] // 8-byte Folded Reload
+; CHECK-NEXT: str pn8, [x27]
+; CHECK-NEXT: ldur x27, [x29, #-240] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-168] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x28, x29, #16
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x30, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-176] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: mul x28, x28, x30
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-240] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: ldur x27, [x29, #-216] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-128] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x28, x29, #16
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: mul x28, x28, x30
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-216] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-232] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-144] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x28, x29, #8
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: mul x28, x28, x30
+; CHECK-NEXT: add x27, x27, x28, lsl #2
+; CHECK-NEXT: ldur x28, [x29, #-232] // 8-byte Folded Reload
+; CHECK-NEXT: stur x27, [x28, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x30, x29, #88
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x27, [x29, #-192] // 8-byte Folded Reload
+; CHECK-NEXT: ldr p0, [x27]
+; CHECK-NEXT: ldur x27, [x29, #-216] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: sub x30, x29, #96
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x27, [x29, #-88] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x27, #-16]
+; CHECK-NEXT: lsr x27, x30, #2
+; CHECK-NEXT: sub x30, x29, #104
+; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x27, [x29, #-200] // 8-byte Folded Reload
+; CHECK-NEXT: ldr p1, [x27]
+; CHECK-NEXT: ldur x27, [x28, #-16]
+; CHECK-NEXT: sub x28, x29, #112
+; CHECK-NEXT: stur x27, [x28, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x27, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: lsr x28, x28, #2
+; CHECK-NEXT: ldur x27, [x29, #-40] // 8-byte Folded Reload
+; CHECK-NEXT: sub x30, x29, #88
+; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: str p0, [x6]
+; CHECK-NEXT: ldur x27, [x29, #-48] // 8-byte Folded Reload
+; CHECK-NEXT: sub x30, x29, #96
+; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-56] // 8-byte Folded Reload
+; CHECK-NEXT: sub x30, x29, #104
+; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: str p1, [x20]
+; CHECK-NEXT: ldur x27, [x29, #-64] // 8-byte Folded Reload
+; CHECK-NEXT: sub x30, x29, #112
+; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: ldur x27, [x29, #-72] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x27, #-16]
+; CHECK-NEXT: ldr p0, [x6]
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: pext { p0.s, p1.s }, pn8[0]
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: and p3.b, p0/z, p0.b, p2.b
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p2.b
+; CHECK-NEXT: mov x27, x23
+; CHECK-NEXT: incd x27
+; CHECK-NEXT: str p0, [x27]
+; CHECK-NEXT: str p3, [x23]
+; CHECK-NEXT: ldr p0, [x20]
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: pext { p0.s, p1.s }, pn8[0]
+; CHECK-NEXT: and p3.b, p0/z, p0.b, p2.b
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p2.b
+; CHECK-NEXT: mov x27, x24
+; CHECK-NEXT: incd x27
+; CHECK-NEXT: str p0, [x27]
+; CHECK-NEXT: str p3, [x24]
+; CHECK-NEXT: b .LBB0_3
+; CHECK-NEXT: .LBB0_3: // %bb178
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr p0, [x6]
+; CHECK-NEXT: ldr x27, [x7]
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z24.s }, p0, [x25, #1, mul vl]
+; CHECK-NEXT: st1w { z16.s }, p0, [x25]
+; CHECK-NEXT: ldr x27, [x19]
+; CHECK-NEXT: ldr x28, [x7]
+; CHECK-NEXT: add x27, x28, x27, lsl #2
+; CHECK-NEXT: str x27, [x7]
+; CHECK-NEXT: ldr p1, [x20]
+; CHECK-NEXT: ldr x27, [x21]
+; CHECK-NEXT: mov p8.b, p1.b
+; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27]
+; CHECK-NEXT: st1w { z24.s }, p0, [x26, #1, mul vl]
+; CHECK-NEXT: st1w { z16.s }, p0, [x26]
+; CHECK-NEXT: ldr x27, [x22]
+; CHECK-NEXT: ldr x28, [x21]
+; CHECK-NEXT: add x27, x28, x27, lsl #2
+; CHECK-NEXT: str x27, [x21]
+; CHECK-NEXT: mov x27, x23
+; CHECK-NEXT: incd x27
+; CHECK-NEXT: ldr p1, [x23]
+; CHECK-NEXT: mov x28, x24
+; CHECK-NEXT: incd x28
+; CHECK-NEXT: ldr p2, [x24]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26]
+; CHECK-NEXT: str p1, [x1]
+; CHECK-NEXT: str p2, [x2]
+; CHECK-NEXT: st1w { z0.s }, p0, [x3]
+; CHECK-NEXT: st1w { z1.s }, p0, [x4]
+; CHECK-NEXT: ldr p1, [x1]
+; CHECK-NEXT: ldr p2, [x2]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x3]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x4]
+; CHECK-NEXT: fmopa za0.s, p1/m, p2/m, z0.s, z1.s
+; CHECK-NEXT: ldr p1, [x27]
+; CHECK-NEXT: ldr p2, [x24]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26]
+; CHECK-NEXT: str p1, [x16]
+; CHECK-NEXT: str p2, [x17]
+; CHECK-NEXT: st1w { z0.s }, p0, [x18]
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ldr p1, [x16]
+; CHECK-NEXT: ldr p2, [x17]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x18]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: fmopa za1.s, p1/m, p2/m, z0.s, z1.s
+; CHECK-NEXT: ldr p1, [x23]
+; CHECK-NEXT: ldr p2, [x28]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26, #1, mul vl]
+; CHECK-NEXT: str p1, [x12]
+; CHECK-NEXT: str p2, [x13]
+; CHECK-NEXT: st1w { z0.s }, p0, [x14]
+; CHECK-NEXT: st1w { z1.s }, p0, [x15]
+; CHECK-NEXT: ldr p1, [x12]
+; CHECK-NEXT: ldr p2, [x13]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x14]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x15]
+; CHECK-NEXT: fmopa za2.s, p1/m, p2/m, z0.s, z1.s
+; CHECK-NEXT: ldr p1, [x27]
+; CHECK-NEXT: ldr p2, [x28]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26, #1, mul vl]
+; CHECK-NEXT: str p1, [x8]
+; CHECK-NEXT: str p2, [x9]
+; CHECK-NEXT: st1w { z0.s }, p0, [x10]
+; CHECK-NEXT: st1w { z1.s }, p0, [x11]
+; CHECK-NEXT: ldr p1, [x8]
+; CHECK-NEXT: ldr p2, [x9]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x10]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x11]
+; CHECK-NEXT: fmopa za3.s, p1/m, p2/m, z0.s, z1.s
+; CHECK-NEXT: ldr x27, [x5]
+; CHECK-NEXT: subs x27, x27, #1
+; CHECK-NEXT: str x27, [x5]
+; CHECK-NEXT: b .LBB0_3
+bb:
+ %alloca = alloca <vscale x 16 x i1>, align 2
+ %alloca1 = alloca <vscale x 16 x i1>, align 2
+ %alloca2 = alloca <vscale x 4 x float>, align 16
+ %alloca3 = alloca <vscale x 4 x float>, align 16
+ %alloca4 = alloca <vscale x 16 x i1>, align 2
+ %alloca5 = alloca <vscale x 16 x i1>, align 2
+ %alloca6 = alloca <vscale x 4 x float>, align 16
+ %alloca7 = alloca <vscale x 4 x float>, align 16
+ %alloca8 = alloca <vscale x 16 x i1>, align 2
+ %alloca9 = alloca <vscale x 16 x i1>, align 2
+ %alloca10 = alloca <vscale x 4 x float>, align 16
+ %alloca11 = alloca <vscale x 4 x float>, align 16
+ %alloca12 = alloca <vscale x 16 x i1>, align 2
+ %alloca13 = alloca <vscale x 16 x i1>, align 2
+ %alloca14 = alloca <vscale x 4 x float>, align 16
+ %alloca15 = alloca <vscale x 4 x float>, align 16
+ %alloca16 = alloca i64, align 8
+ %alloca17 = alloca i64, align 8
+ %alloca18 = alloca ptr, align 8
+ %alloca19 = alloca i64, align 8
+ %alloca20 = alloca i64, align 8
+ %alloca21 = alloca target("aarch64.svcount"), align 2
+ %alloca22 = alloca i32, align 4
+ %alloca23 = alloca <vscale x 32 x i8>, align 16
+ %alloca24 = alloca i64, align 8
+ %alloca25 = alloca target("aarch64.svcount"), align 2
+ %alloca26 = alloca ptr, align 8
+ %alloca27 = alloca i64, align 8
+ %alloca28 = alloca target("aarch64.svcount"), align 2
+ %alloca29 = alloca ptr, align 8
+ %alloca30 = alloca i64, align 8
+ %alloca31 = alloca <vscale x 32 x i1>, align 2
+ %alloca32 = alloca <vscale x 32 x i1>, align 2
+ %alloca33 = alloca <vscale x 8 x float>, align 16
+ %alloca34 = alloca <vscale x 8 x float>, align 16
+ %alloca35 = alloca i64, align 8
+ %alloca36 = alloca i64, align 8
+ %alloca37 = alloca i64, align 8
+ %alloca38 = alloca i64, align 8
+ %alloca39 = alloca i64, align 8
+ %alloca40 = alloca i64, align 8
+ %alloca41 = alloca i64, align 8
+ %alloca42 = alloca i8, align 1
+ %alloca43 = alloca ptr, align 8
+ %alloca44 = alloca i64, align 8
+ %alloca45 = alloca i8, align 1
+ %alloca46 = alloca ptr, align 8
+ %alloca47 = alloca i64, align 8
+ %alloca48 = alloca ptr, align 8
+ %alloca49 = alloca i64, align 8
+ %alloca50 = alloca i8, align 1
+ %alloca51 = alloca ptr, align 8
+ %alloca52 = alloca ptr, align 8
+ %alloca53 = alloca ptr, align 8
+ %alloca54 = alloca i64, align 8
+ %alloca55 = alloca i64, align 8
+ %alloca56 = alloca i64, align 8
+ %alloca57 = alloca i64, align 8
+ %alloca58 = alloca i64, align 8
+ %alloca59 = alloca i64, align 8
+ %alloca60 = alloca i64, align 8
+ %alloca61 = alloca i64, align 8
+ %alloca62 = alloca i64, align 8
+ %alloca63 = alloca target("aarch64.svcount"), align 2
+ %alloca64 = alloca target("aarch64.svcount"), align 2
+ %alloca65 = alloca ptr, align 8
+ %alloca66 = alloca ptr, align 8
+ %alloca67 = alloca ptr, align 8
+ store i8 0, ptr %alloca42, align 1
+ store i8 0, ptr %alloca45, align 1
+ store i8 0, ptr %alloca50, align 1
+ store ptr null, ptr %alloca51, align 8
+ %load = load ptr, ptr %alloca43, align 8
+ %load68 = load i64, ptr %alloca39, align 8
+ %getelementptr = getelementptr inbounds float, ptr %load, i64 %load68
+ %load69 = load i64, ptr %alloca41, align 8
+ %sub = sub i64 %load69, 1
+ %load70 = load i64, ptr %alloca44, align 8
+ %mul = mul i64 %sub, %load70
+ %getelementptr71 = getelementptr inbounds float, ptr %getelementptr, i64 %mul
+ store ptr %getelementptr71, ptr %alloca51, align 8
+ store ptr null, ptr %alloca52, align 8
+ %load72 = load ptr, ptr %alloca46, align 8
+ %load73 = load i64, ptr %alloca40, align 8
+ %getelementptr74 = getelementptr inbounds float, ptr %load72, i64 %load73
+ %load75 = load i64, ptr %alloca41, align 8
+ %sub76 = sub i64 %load75, 1
+ %load77 = load i64, ptr %alloca47, align 8
+ %mul78 = mul i64 %sub76, %load77
+ %getelementptr79 = getelementptr inbounds float, ptr %getelementptr74, i64 %mul78
+ store ptr %getelementptr79, ptr %alloca52, align 8
+ store ptr null, ptr %alloca53, align 8
+ %load80 = load ptr, ptr %alloca48, align 8
+ %load81 = load i64, ptr %alloca39, align 8
+ %getelementptr82 = getelementptr inbounds float, ptr %load80, i64 %load81
+ %load83 = load i64, ptr %alloca40, align 8
+ %sub84 = sub i64 %load83, 1
+ %load85 = load i64, ptr %alloca49, align 8
+ %mul86 = mul i64 %sub84, %load85
+ %getelementptr87 = getelementptr inbounds float, ptr %getelementptr82, i64 %mul86
+ store ptr %getelementptr87, ptr %alloca53, align 8
+ store i64 32, ptr %alloca54, align 8
+ store i64 32, ptr %alloca55, align 8
+ store i64 0, ptr %alloca56, align 8
+ %load88 = load i64, ptr %alloca41, align 8
+ %mul89 = mul i64 32, %load88
+ store i64 %mul89, ptr %alloca56, align 8
+ %load90 = load i8, ptr %alloca42, align 1
+ %trunc = trunc i8 %load90 to i1
+ store i64 32, ptr %alloca44, align 8
+ store i64 0, ptr %alloca57, align 8
+ %load91 = load i64, ptr %alloca39, align 8
+ %sub92 = sub i64 %load91, 1
+ %udiv = udiv i64 %sub92, 32
+ %add = add i64 %udiv, 1
+ store i64 %add, ptr %alloca57, align 8
+ %load93 = load ptr, ptr %alloca43, align 8
+ %load94 = load i64, ptr %alloca57, align 8
+ %load95 = load i64, ptr %alloca56, align 8
+ %mul96 = mul i64 %load94, %load95
+ %getelementptr97 = getelementptr inbounds float, ptr %load93, i64 %mul96
+ store ptr %getelementptr97, ptr %alloca51, align 8
+ %load98 = load i8, ptr %alloca45, align 1
+ %trunc99 = trunc i8 %load98 to i1
+ store i64 32, ptr %alloca47, align 8
+ store i64 0, ptr %alloca58, align 8
+ %load100 = load i64, ptr %alloca40, align 8
+ %sub101 = sub i64 %load100, 1
+ %udiv102 = udiv i64 %sub101, 32
+ %add103 = add i64 %udiv102, 1
+ store i64 %add103, ptr %alloca58, align 8
+ %load104 = load ptr, ptr %alloca46, align 8
+ %load105 = load i64, ptr %alloca58, align 8
+ %load106 = load i64, ptr %alloca56, align 8
+ %mul107 = mul i64 %load105, %load106
+ %getelementptr108 = getelementptr inbounds float, ptr %load104, i64 %mul107
+ store ptr %getelementptr108, ptr %alloca52, align 8
+ store i64 0, ptr %alloca59, align 8
+ store i64 0, ptr %alloca59, align 8
+ %load109 = load i64, ptr %alloca59, align 8
+ %load110 = load i64, ptr %alloca40, align 8
+ %icmp = icmp ult i64 %load109, %load110
+ store i64 0, ptr %alloca60, align 8
+ store i64 0, ptr %alloca60, align 8
+ %load111 = load i64, ptr %alloca60, align 8
+ %load112 = load i64, ptr %alloca39, align 8
+ %icmp113 = icmp ult i64 %load111, %load112
+ store i64 0, ptr %alloca61, align 8
+ %load114 = load i64, ptr %alloca39, align 8
+ %load115 = load i64, ptr %alloca60, align 8
+ %sub116 = sub i64 %load114, %load115
+ store i64 %sub116, ptr %alloca35, align 8
+ store i64 32, ptr %alloca36, align 8
+ %load117 = load i64, ptr %alloca35, align 8
+ %load118 = load i64, ptr %alloca36, align 8
+ %icmp119 = icmp ult i64 %load117, %load118
+ %load120 = load i64, ptr %alloca35, align 8
+ store i64 %load120, ptr %alloca61, align 8
+ store i64 0, ptr %alloca62, align 8
+ %load121 = load i64, ptr %alloca40, align 8
+ %load122 = load i64, ptr %alloca59, align 8
+ %sub123 = sub i64 %load121, %load122
+ store i64 %sub123, ptr %alloca37, align 8
+ store i64 32, ptr %alloca38, align 8
+ %load124 = load i64, ptr %alloca37, align 8
+ %load125 = load i64, ptr %alloca38, align 8
+ %icmp126 = icmp ult i64 %load124, %load125
+ %load127 = load i64, ptr %alloca37, align 8
+ store i64 %load127, ptr %alloca62, align 8
+ %load128 = load i64, ptr %alloca60, align 8
+ %load129 = load i64, ptr %alloca39, align 8
+ %call = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 %load128, i64 %load129, i32 2)
+ store target("aarch64.svcount") %call, ptr %alloca63, align 2
+ %load130 = load i64, ptr %alloca59, align 8
+ %load131 = load i64, ptr %alloca40, align 8
+ %call132 = call target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64 %load130, i64 %load131, i32 2)
+ store target("aarch64.svcount") %call132, ptr %alloca64, align 2
+ store ptr null, ptr %alloca65, align 8
+ %load133 = load ptr, ptr %alloca48, align 8
+ %load134 = load i64, ptr %alloca60, align 8
+ %getelementptr135 = getelementptr inbounds float, ptr %load133, i64 %load134
+ %load136 = load i64, ptr %alloca59, align 8
+ %load137 = load i64, ptr %alloca49, align 8
+ %mul138 = mul i64 %load136, %load137
+ %getelementptr139 = getelementptr inbounds float, ptr %getelementptr135, i64 %mul138
+ store ptr %getelementptr139, ptr %alloca65, align 8
+ call void @llvm.aarch64.sme.zero(i32 255)
+ store ptr null, ptr %alloca66, align 8
+ %load140 = load i8, ptr %alloca42, align 1
+ %trunc141 = trunc i8 %load140 to i1
+ %load142 = load ptr, ptr %alloca43, align 8
+ %load143 = load i64, ptr %alloca60, align 8
+ %load144 = load i64, ptr %alloca41, align 8
+ %mul145 = mul i64 %load143, %load144
+ %getelementptr146 = getelementptr inbounds float, ptr %load142, i64 %mul145
+ store ptr %getelementptr146, ptr %alloca66, align 8
+ store ptr null, ptr %alloca67, align 8
+ %load147 = load i8, ptr %alloca45, align 1
+ %trunc148 = trunc i8 %load147 to i1
+ %load149 = load ptr, ptr %alloca46, align 8
+ %load150 = load i64, ptr %alloca59, align 8
+ %load151 = load i64, ptr %alloca41, align 8
+ %mul152 = mul i64 %load150, %load151
+ %getelementptr153 = getelementptr inbounds float, ptr %load149, i64 %mul152
+ store ptr %getelementptr153, ptr %alloca67, align 8
+ %load154 = load i64, ptr %alloca41, align 8
+ %load155 = load target("aarch64.svcount"), ptr %alloca63, align 2
+ %load156 = load ptr, ptr %alloca66, align 8
+ %load157 = load i64, ptr %alloca44, align 8
+ %udiv158 = udiv i64 %load157, 4
+ %load159 = load target("aarch64.svcount"), ptr %alloca64, align 2
+ %load160 = load ptr, ptr %alloca67, align 8
+ %load161 = load i64, ptr %alloca47, align 8
+ %udiv162 = udiv i64 %load161, 4
+ store i64 %load154, ptr %alloca24, align 8
+ store target("aarch64.svcount") %load155, ptr %alloca25, align 2
+ store ptr %load156, ptr %alloca26, align 8
+ store i64 %udiv158, ptr %alloca27, align 8
+ store target("aarch64.svcount") %load159, ptr %alloca28, align 2
+ store ptr %load160, ptr %alloca29, align 8
+ store i64 %udiv162, ptr %alloca30, align 8
+ %load163 = load target("aarch64.svcount"), ptr %alloca25, align 2
+ %call164 = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") %load163, i32 0)
+ %extractvalue = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call164, 0
+ %call165 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue)
+ %call166 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %call165, i64 0)
+ %extractvalue167 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call164, 1
+ %call168 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue167)
+ %call169 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %call166, <vscale x 16 x i1> %call168, i64 16)
+ store <vscale x 32 x i1> %call169, ptr %alloca31, align 2
+ %load170 = load target("aarch64.svcount"), ptr %alloca28, align 2
+ %call171 = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") %load170, i32 0)
+ %extractvalue172 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call171, 0
+ %call173 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue172)
+ %call174 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %call173, i64 0)
+ %extractvalue175 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %call171, 1
+ %call176 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %extractvalue175)
+ %call177 = call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %call174, <vscale x 16 x i1> %call176, i64 16)
+ store <vscale x 32 x i1> %call177, ptr %alloca32, align 2
+ br label %bb178
+
+bb178: ; preds = %bb178, %bb
+ %load179 = load i64, ptr %alloca24, align 8
+ %icmp180 = icmp ugt i64 %load179, 0
+ %load181 = load target("aarch64.svcount"), ptr %alloca25, align 2
+ %load182 = load ptr, ptr %alloca26, align 8
+ %call183 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %load181, ptr %load182)
+ %extractvalue184 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call183, 0
+ %call185 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> %extractvalue184, i64 0)
+ %extractvalue186 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call183, 1
+ %call187 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> %call185, <vscale x 4 x float> %extractvalue186, i64 4)
+ store <vscale x 8 x float> %call187, ptr %alloca33, align 16
+ %load188 = load i64, ptr %alloca27, align 8
+ %load189 = load ptr, ptr %alloca26, align 8
+ %getelementptr190 = getelementptr inbounds float, ptr %load189, i64 %load188
+ store ptr %getelementptr190, ptr %alloca26, align 8
+ %load191 = load target("aarch64.svcount"), ptr %alloca28, align 2
+ %load192 = load ptr, ptr %alloca29, align 8
+ %call193 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %load191, ptr %load192)
+ %extractvalue194 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call193, 0
+ %call195 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> %extractvalue194, i64 0)
+ %extractvalue196 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %call193, 1
+ %call197 = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> %call195, <vscale x 4 x float> %extractvalue196, i64 4)
+ store <vscale x 8 x float> %call197, ptr %alloca34, align 16
+ %load198 = load i64, ptr %alloca30, align 8
+ %load199 = load ptr, ptr %alloca29, align 8
+ %getelementptr200 = getelementptr inbounds float, ptr %load199, i64 %load198
+ store ptr %getelementptr200, ptr %alloca29, align 8
+ %load201 = load <vscale x 32 x i1>, ptr %alloca31, align 2
+ %call202 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load201, i64 0)
+ %load203 = load <vscale x 32 x i1>, ptr %alloca32, align 2
+ %call204 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load203, i64 0)
+ %load205 = load <vscale x 8 x float>, ptr %alloca33, align 16
+ %call206 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load205, i64 0)
+ %load207 = load <vscale x 8 x float>, ptr %alloca34, align 16
+ %call208 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load207, i64 0)
+ store <vscale x 16 x i1> %call202, ptr %alloca12, align 2
+ store <vscale x 16 x i1> %call204, ptr %alloca13, align 2
+ store <vscale x 4 x float> %call206, ptr %alloca14, align 16
+ store <vscale x 4 x float> %call208, ptr %alloca15, align 16
+ %load209 = load <vscale x 16 x i1>, ptr %alloca12, align 2
+ %load210 = load <vscale x 16 x i1>, ptr %alloca13, align 2
+ %load211 = load <vscale x 4 x float>, ptr %alloca14, align 16
+ %load212 = load <vscale x 4 x float>, ptr %alloca15, align 16
+ %call213 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load209)
+ %call214 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load210)
+ call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %call213, <vscale x 4 x i1> %call214, <vscale x 4 x float> %load211, <vscale x 4 x float> %load212)
+ %load215 = load <vscale x 32 x i1>, ptr %alloca31, align 2
+ %call216 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load215, i64 16)
+ %load217 = load <vscale x 32 x i1>, ptr %alloca32, align 2
+ %call218 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load217, i64 0)
+ %load219 = load <vscale x 8 x float>, ptr %alloca33, align 16
+ %call220 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load219, i64 4)
+ %load221 = load <vscale x 8 x float>, ptr %alloca34, align 16
+ %call222 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load221, i64 0)
+ store <vscale x 16 x i1> %call216, ptr %alloca8, align 2
+ store <vscale x 16 x i1> %call218, ptr %alloca9, align 2
+ store <vscale x 4 x float> %call220, ptr %alloca10, align 16
+ store <vscale x 4 x float> %call222, ptr %alloca11, align 16
+ %load223 = load <vscale x 16 x i1>, ptr %alloca8, align 2
+ %load224 = load <vscale x 16 x i1>, ptr %alloca9, align 2
+ %load225 = load <vscale x 4 x float>, ptr %alloca10, align 16
+ %load226 = load <vscale x 4 x float>, ptr %alloca11, align 16
+ %call227 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load223)
+ %call228 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load224)
+ call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, <vscale x 4 x i1> %call227, <vscale x 4 x i1> %call228, <vscale x 4 x float> %load225, <vscale x 4 x float> %load226)
+ %load229 = load <vscale x 32 x i1>, ptr %alloca31, align 2
+ %call230 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load229, i64 0)
+ %load231 = load <vscale x 32 x i1>, ptr %alloca32, align 2
+ %call232 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load231, i64 16)
+ %load233 = load <vscale x 8 x float>, ptr %alloca33, align 16
+ %call234 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load233, i64 0)
+ %load235 = load <vscale x 8 x float>, ptr %alloca34, align 16
+ %call236 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load235, i64 4)
+ store <vscale x 16 x i1> %call230, ptr %alloca4, align 2
+ store <vscale x 16 x i1> %call232, ptr %alloca5, align 2
+ store <vscale x 4 x float> %call234, ptr %alloca6, align 16
+ store <vscale x 4 x float> %call236, ptr %alloca7, align 16
+ %load237 = load <vscale x 16 x i1>, ptr %alloca4, align 2
+ %load238 = load <vscale x 16 x i1>, ptr %alloca5, align 2
+ %load239 = load <vscale x 4 x float>, ptr %alloca6, align 16
+ %load240 = load <vscale x 4 x float>, ptr %alloca7, align 16
+ %call241 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load237)
+ %call242 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load238)
+ call void @llvm.aarch64.sme.mopa.nxv4f32(i32 2, <vscale x 4 x i1> %call241, <vscale x 4 x i1> %call242, <vscale x 4 x float> %load239, <vscale x 4 x float> %load240)
+ %load243 = load <vscale x 32 x i1>, ptr %alloca31, align 2
+ %call244 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load243, i64 16)
+ %load245 = load <vscale x 32 x i1>, ptr %alloca32, align 2
+ %call246 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %load245, i64 16)
+ %load247 = load <vscale x 8 x float>, ptr %alloca33, align 16
+ %call248 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load247, i64 4)
+ %load249 = load <vscale x 8 x float>, ptr %alloca34, align 16
+ %call250 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> %load249, i64 4)
+ store <vscale x 16 x i1> %call244, ptr %alloca, align 2
+ store <vscale x 16 x i1> %call246, ptr %alloca1, align 2
+ store <vscale x 4 x float> %call248, ptr %alloca2, align 16
+ store <vscale x 4 x float> %call250, ptr %alloca3, align 16
+ %load251 = load <vscale x 16 x i1>, ptr %alloca, align 2
+ %load252 = load <vscale x 16 x i1>, ptr %alloca1, align 2
+ %load253 = load <vscale x 4 x float>, ptr %alloca2, align 16
+ %load254 = load <vscale x 4 x float>, ptr %alloca3, align 16
+ %call255 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load251)
+ %call256 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %load252)
+ call void @llvm.aarch64.sme.mopa.nxv4f32(i32 3, <vscale x 4 x i1> %call255, <vscale x 4 x i1> %call256, <vscale x 4 x float> %load253, <vscale x 4 x float> %load254)
+ %load257 = load i64, ptr %alloca24, align 8
+ %add258 = add i64 %load257, -1
+ store i64 %add258, ptr %alloca24, align 8
+ br label %bb178
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c32(i64, i64, i32 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.aarch64.sme.zero(i32 immarg) #3
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount"), i32 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1>, <vscale x 16 x i1>, i64 immarg) #4
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr) #5
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float>, <vscale x 4 x float>, i64 immarg) #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1>, i64 immarg) #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float>, i64 immarg) #4
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.aarch64.sme.mopa.nxv4f32(i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>) #3
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare target("aarch64.svcount") @llvm.aarch64.sve.whilelt.c8(i64, i64, i32 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32) #3
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8>, <vscale x 16 x i8>, i64 immarg) #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8>, i64 immarg) #4
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr) #6
+
+attributes #0 = { cold noreturn nounwind }
+attributes #1 = { mustprogress noinline optnone ssp uwtable(sync) vscale_range(1,16) "aarch64_new_za" "aarch64_pstate_sm_enabled" "frame-pointer"="non-leaf" "target-features"="+fp-armv8,+fullfp16,+sme,+sme-f64f64,+sme2" }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn }
+attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
From 7292d0b2913f16a5cfc18e30ec351468ed13fbff Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Thu, 9 May 2024 11:46:21 -0700
Subject: [PATCH 2/2] [AArch64][SME] Fix frame lowering not using a base
pointer for SME functions.
The existing code checks for the presence of the +sve subtarget feature when
deciding whether to use a base pointer for the function, but that check is
not satisfied when only +sme is enabled.
rdar://126878490
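
Not part of the patch itself: a minimal standalone sketch of the decision this
change affects, under a deliberately simplified model. hasSVE() and
hasSVEorSME() mirror the real AArch64Subtarget queries named in the diff
below; SubtargetModel, hasBasePointerOld/New and their boolean parameters are
hypothetical scaffolding, not LLVM APIs. The authoritative change is the
one-line diff in AArch64RegisterInfo.cpp further down.

  // Illustrative only -- compresses AArch64RegisterInfo::hasBasePointer()
  // down to the condition this patch changes.
  #include <iostream>

  struct SubtargetModel {
    bool SVE;
    bool SME;
    bool hasSVE() const { return SVE; }
    bool hasSVEorSME() const { return SVE || SME; }
  };

  // Before: a frame with both variable-sized and scalable stack objects only
  // got a base pointer when +sve was present.
  bool hasBasePointerOld(const SubtargetModel &ST, bool VarSizedObjects,
                         bool ScalableObjects) {
    return VarSizedObjects && ST.hasSVE() && ScalableObjects;
  }

  // After: +sme alone is enough, so SME-only functions get a base pointer too.
  bool hasBasePointerNew(const SubtargetModel &ST, bool VarSizedObjects,
                         bool ScalableObjects) {
    return VarSizedObjects && ST.hasSVEorSME() && ScalableObjects;
  }

  int main() {
    SubtargetModel SmeOnly{false, true}; // +sme without +sve
    std::cout << "old: " << hasBasePointerOld(SmeOnly, true, true) << '\n'; // 0
    std::cout << "new: " << hasBasePointerNew(SmeOnly, true, true) << '\n'; // 1
  }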
---
.../Target/AArch64/AArch64RegisterInfo.cpp | 3 +-
.../AArch64/sme-disable-gisel-fisel.ll | 24 +-
.../CodeGen/AArch64/sme-framelower-use-bp.ll | 930 ++++++++++--------
.../CodeGen/AArch64/sme-lazy-save-call.ll | 46 +-
.../AArch64/sme-shared-za-interface.ll | 12 +-
llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 44 +-
6 files changed, 571 insertions(+), 488 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ad29003f1e817..a192e01f69b20 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -552,7 +552,8 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
if (hasStackRealignment(MF))
return true;
- if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+ auto &ST = MF.getSubtarget<AArch64Subtarget>();
+ if (ST.hasSVEorSME()) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// Frames that have variable sized objects and scalable SVE objects,
// should always use a basepointer.
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 254e37e836cbb..50d04e39f3527 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -214,7 +214,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
; CHECK-COMMON: // %bb.0: // %prelude
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -240,7 +241,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: smstop za
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @za_shared_callee(double %x)
@@ -251,7 +253,8 @@ entry:
define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{
; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -279,7 +282,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: fmov d1, x8
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @normal_callee(double %x)
@@ -291,7 +295,8 @@ entry:
define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: f128_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -314,7 +319,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB8_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = fadd fp128 %a, %b
ret fp128 %res
@@ -353,7 +359,8 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: frem_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -376,7 +383,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB10_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = frem double %a, %b
ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 7050f80f46e0b..4818a8c4db7c4 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -15,7 +15,7 @@ define void @quux() #1 {
; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #384
+; CHECK-NEXT: sub sp, sp, #464
; CHECK-NEXT: .cfi_def_cfa w29, 96
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
@@ -46,18 +46,19 @@ define void @quux() #1 {
; CHECK-NEXT: .LBB0_2: // %bb
; CHECK-NEXT: smstart za
; CHECK-NEXT: zero {za}
-; CHECK-NEXT: mov w23, #15 // =0xf
+; CHECK-NEXT: mov w24, #15 // =0xf
; CHECK-NEXT: mov w8, #15 // =0xf
; CHECK-NEXT: incd x8
; CHECK-NEXT: // kill: def $w8 killed $w8 killed $x8 def $x8
-; CHECK-NEXT: and x28, x8, #0x70
+; CHECK-NEXT: and x10, x8, #0x70
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: subs x8, x8, x28
+; CHECK-NEXT: subs x8, x8, x10
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: subs x9, x9, x28
+; CHECK-NEXT: subs x9, x9, x10
+; CHECK-NEXT: mov x3, x10
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: addvl x10, x23, #1
+; CHECK-NEXT: addvl x10, x24, #1
; CHECK-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10
; CHECK-NEXT: and x4, x10, #0x3f0
; CHECK-NEXT: mov x10, sp
@@ -67,10 +68,10 @@ define void @quux() #1 {
; CHECK-NEXT: subs x11, x11, x4
; CHECK-NEXT: mov sp, x11
; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: subs x12, x12, x28
+; CHECK-NEXT: subs x12, x12, x3
; CHECK-NEXT: mov sp, x12
; CHECK-NEXT: mov x13, sp
-; CHECK-NEXT: subs x13, x13, x28
+; CHECK-NEXT: subs x13, x13, x3
; CHECK-NEXT: mov sp, x13
; CHECK-NEXT: mov x14, sp
; CHECK-NEXT: subs x14, x14, x4
@@ -79,10 +80,10 @@ define void @quux() #1 {
; CHECK-NEXT: subs x15, x15, x4
; CHECK-NEXT: mov sp, x15
; CHECK-NEXT: mov x16, sp
-; CHECK-NEXT: subs x16, x16, x28
+; CHECK-NEXT: subs x16, x16, x3
; CHECK-NEXT: mov sp, x16
; CHECK-NEXT: mov x17, sp
-; CHECK-NEXT: subs x17, x17, x28
+; CHECK-NEXT: subs x17, x17, x3
; CHECK-NEXT: mov sp, x17
; CHECK-NEXT: mov x18, sp
; CHECK-NEXT: subs x18, x18, x4
@@ -91,10 +92,12 @@ define void @quux() #1 {
; CHECK-NEXT: subs x0, x0, x4
; CHECK-NEXT: mov sp, x0
; CHECK-NEXT: mov x1, sp
-; CHECK-NEXT: subs x1, x1, x28
+; CHECK-NEXT: subs x1, x1, x3
; CHECK-NEXT: mov sp, x1
; CHECK-NEXT: mov x2, sp
-; CHECK-NEXT: subs x2, x2, x28
+; CHECK-NEXT: subs x2, x2, x3
+; CHECK-NEXT: mov x22, x3
+; CHECK-NEXT: stur x22, [x29, #-184] // 8-byte Folded Spill
; CHECK-NEXT: mov sp, x2
; CHECK-NEXT: mov x3, sp
; CHECK-NEXT: subs x3, x3, x4
@@ -113,510 +116,563 @@ define void @quux() #1 {
; CHECK-NEXT: subs x5, sp, #16
; CHECK-NEXT: mov sp, x5
; CHECK-NEXT: mov x5, sp
-; CHECK-NEXT: subs x5, x5, x28
+; CHECK-NEXT: subs x5, x5, x22
; CHECK-NEXT: mov sp, x5
; CHECK-NEXT: subs x5, sp, #16
; CHECK-NEXT: mov sp, x5
-; CHECK-NEXT: addvl x5, x23, #2
+; CHECK-NEXT: addvl x5, x24, #2
; CHECK-NEXT: // kill: def $w5 killed $w5 killed $x5 def $x5
-; CHECK-NEXT: and x26, x5, #0x7f0
+; CHECK-NEXT: and x27, x5, #0x7f0
; CHECK-NEXT: mov x5, sp
-; CHECK-NEXT: subs x5, x5, x26
+; CHECK-NEXT: subs x5, x5, x27
; CHECK-NEXT: and sp, x5, #0xffffffffffffffe0
; CHECK-NEXT: mov x5, sp
; CHECK-NEXT: stur x5, [x29, #-40] // 8-byte Folded Spill
; CHECK-NEXT: subs x5, x5, #16
; CHECK-NEXT: mov sp, x5
; CHECK-NEXT: mov x6, sp
-; CHECK-NEXT: subs x6, x6, x28
+; CHECK-NEXT: subs x6, x6, x22
; CHECK-NEXT: mov sp, x6
; CHECK-NEXT: mov x7, sp
; CHECK-NEXT: stur x7, [x29, #-48] // 8-byte Folded Spill
; CHECK-NEXT: subs x7, x7, #16
; CHECK-NEXT: mov sp, x7
-; CHECK-NEXT: mov x19, sp
-; CHECK-NEXT: stur x19, [x29, #-56] // 8-byte Folded Spill
-; CHECK-NEXT: subs x19, x19, #16
-; CHECK-NEXT: mov sp, x19
; CHECK-NEXT: mov x20, sp
-; CHECK-NEXT: subs x20, x20, x28
+; CHECK-NEXT: stur x20, [x29, #-56] // 8-byte Folded Spill
+; CHECK-NEXT: subs x20, x20, #16
; CHECK-NEXT: mov sp, x20
; CHECK-NEXT: mov x21, sp
-; CHECK-NEXT: stur x21, [x29, #-64] // 8-byte Folded Spill
-; CHECK-NEXT: subs x21, x21, #16
+; CHECK-NEXT: subs x21, x21, x22
; CHECK-NEXT: mov sp, x21
; CHECK-NEXT: mov x22, sp
-; CHECK-NEXT: stur x22, [x29, #-72] // 8-byte Folded Spill
+; CHECK-NEXT: stur x22, [x29, #-64] // 8-byte Folded Spill
; CHECK-NEXT: subs x22, x22, #16
; CHECK-NEXT: mov sp, x22
-; CHECK-NEXT: incw x23
-; CHECK-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23
-; CHECK-NEXT: and x24, x23, #0xf0
; CHECK-NEXT: mov x23, sp
-; CHECK-NEXT: subs x23, x23, x24
+; CHECK-NEXT: stur x23, [x29, #-72] // 8-byte Folded Spill
+; CHECK-NEXT: subs x23, x23, #16
; CHECK-NEXT: mov sp, x23
-; CHECK-NEXT: mov x25, sp
-; CHECK-NEXT: subs x24, x25, x24
+; CHECK-NEXT: incw x24
+; CHECK-NEXT: // kill: def $w24 killed $w24 killed $x24 def $x24
+; CHECK-NEXT: and x25, x24, #0xf0
+; CHECK-NEXT: mov x24, sp
+; CHECK-NEXT: subs x24, x24, x25
; CHECK-NEXT: mov sp, x24
-; CHECK-NEXT: mov x25, sp
-; CHECK-NEXT: subs x25, x25, x26
-; CHECK-NEXT: and x25, x25, #0xffffffffffffffe0
+; CHECK-NEXT: mov x26, sp
+; CHECK-NEXT: subs x25, x26, x25
; CHECK-NEXT: mov sp, x25
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: subs x26, x27, x26
+; CHECK-NEXT: mov x26, sp
+; CHECK-NEXT: subs x26, x26, x27
; CHECK-NEXT: and x26, x26, #0xffffffffffffffe0
; CHECK-NEXT: mov sp, x26
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-96] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: subs x27, x28, x27
+; CHECK-NEXT: and x27, x27, #0xffffffffffffffe0
; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-112] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-104] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-120] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-160] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-152] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-80] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-224] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-128] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-88] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-248] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-144] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-136] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-168] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-176] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-96] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-112] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-104] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-120] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-128] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-136] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-80] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-232] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-144] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-88] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #16
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-160] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-152] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-168] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-176] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #88
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-216] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-192] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #48
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #56
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
; CHECK-NEXT: sub x30, x29, #80
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-208] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-184] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-240] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #72
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #64
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
; CHECK-NEXT: sub x30, x29, #32
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x28, x28, #16
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
; CHECK-NEXT: sub x30, x29, #40
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: sub x30, x29, #56
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: sub x30, x29, #64
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: sub x30, x29, #72
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: sub x30, x29, #48
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: sub x30, x29, #8
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x27, x27, #16
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: sub x30, x29, #16
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x28, #16
; CHECK-NEXT: mov sp, x30
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x30, x27, #16
-; CHECK-NEXT: mov sp, x30
-; CHECK-NEXT: mov x27, sp
+; CHECK-NEXT: mov x28, sp
; CHECK-NEXT: sub x30, x29, #24
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x28, #16
+; CHECK-NEXT: mov sp, x30
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: sub x30, x29, #8
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x28, #16
; CHECK-NEXT: mov sp, x30
; CHECK-NEXT: mov x30, sp
-; CHECK-NEXT: subs x27, x30, x28
-; CHECK-NEXT: stur x27, [x29, #-192] // 8-byte Folded Spill
-; CHECK-NEXT: mov sp, x27
+; CHECK-NEXT: ldur x28, [x29, #-184] // 8-byte Folded Reload
+; CHECK-NEXT: subs x30, x30, x28
+; CHECK-NEXT: stur x30, [x29, #-200] // 8-byte Folded Spill
+; CHECK-NEXT: mov sp, x30
; CHECK-NEXT: mov x30, sp
-; CHECK-NEXT: subs x27, x30, x28
-; CHECK-NEXT: stur x27, [x29, #-200] // 8-byte Folded Spill
-; CHECK-NEXT: mov sp, x27
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-240] // 8-byte Folded Spill
-; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: subs x28, x30, x28
+; CHECK-NEXT: stur x28, [x29, #-208] // 8-byte Folded Spill
+; CHECK-NEXT: mov sp, x28
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-248] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x28, #16
; CHECK-NEXT: mov sp, x30
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-216] // 8-byte Folded Spill
-; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-224] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x28, #16
; CHECK-NEXT: mov sp, x30
-; CHECK-NEXT: mov x27, sp
-; CHECK-NEXT: stur x27, [x29, #-232] // 8-byte Folded Spill
-; CHECK-NEXT: subs x30, x27, #16
+; CHECK-NEXT: mov x28, sp
+; CHECK-NEXT: stur x28, [x29, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: subs x30, x28, #16
; CHECK-NEXT: mov sp, x30
-; CHECK-NEXT: ldur x27, [x29, #-224] // 8-byte Folded Reload
-; CHECK-NEXT: sturb wzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-248] // 8-byte Folded Reload
-; CHECK-NEXT: sturb wzr, [x27, #-16]
-; CHECK-NEXT: sub x27, x29, #80
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: sturb wzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-208] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-128] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x30, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-160] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-232] // 8-byte Folded Reload
+; CHECK-NEXT: sturb wzr, [x28, #-16]
+; CHECK-NEXT: stur x8, [x29, #-8]
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: sturb wzr, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #88
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: sturb wzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-216] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-144] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-128] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: sub x8, x29, #136
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: subs x30, x30, #1
+; CHECK-NEXT: ldur x28, [x29, #-88] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #136
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
; CHECK-NEXT: add x28, x30, x28, lsl #2
-; CHECK-NEXT: ldur x27, [x29, #-80] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x30, [x27, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-216] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-192] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: add x28, x28, x30, lsl #2
+; CHECK-NEXT: sub x8, x29, #144
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
; CHECK-NEXT: subs x30, x30, #1
-; CHECK-NEXT: ldur x27, [x29, #-88] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: mul x27, x30, x27
-; CHECK-NEXT: add x27, x28, x27, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-208] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-184] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-144] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #144
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x30, [x29, #-192] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: sub x8, x29, #48
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-168] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: subs x28, x28, #1
-; CHECK-NEXT: ldur x30, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x29, #-128] // 8-byte Folded Reload
; CHECK-NEXT: ldur x30, [x30, #-16]
-; CHECK-NEXT: mul x28, x28, x30
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-184] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: sub x27, x29, #32
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-168] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x28, x30, lsl #2
+; CHECK-NEXT: sub x8, x29, #152
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: subs x30, x30, #1
+; CHECK-NEXT: ldur x28, [x29, #-176] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #152
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: sub x8, x29, #48
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: mov w30, #32 // =0x20
+; CHECK-NEXT: sub x8, x29, #56
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #80
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-240] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: lsl x30, x30, #5
+; CHECK-NEXT: ldur x28, [x29, #-240] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-88] // 8-byte Folded Reload
+; CHECK-NEXT: mov w30, #32 // =0x20
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #72
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-128] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: subs x30, x30, #1
+; CHECK-NEXT: lsr x30, x30, #5
+; CHECK-NEXT: add x30, x30, #1
+; CHECK-NEXT: sub x8, x29, #72
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-144] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: subs x28, x28, #1
-; CHECK-NEXT: ldur x30, [x29, #-176] // 8-byte Folded Reload
+; CHECK-NEXT: sub x8, x29, #160
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: sub x8, x29, #72
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-240] // 8-byte Folded Reload
; CHECK-NEXT: ldur x30, [x30, #-16]
; CHECK-NEXT: mul x28, x28, x30
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: sub x28, x29, #32
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: mov w28, #32 // =0x20
-; CHECK-NEXT: sub x27, x29, #40
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x28, [x27, #-16]
-; CHECK-NEXT: sub x27, x29, #56
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x28, [x27, #-16]
-; CHECK-NEXT: sub x27, x29, #64
-; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x30, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-80] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x28, [x27, #-16]
-; CHECK-NEXT: lsl x28, x28, #5
+; CHECK-NEXT: sub x8, x29, #160
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x30, [x29, #-216] // 8-byte Folded Reload
; CHECK-NEXT: stur x28, [x30, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-88] // 8-byte Folded Reload
-; CHECK-NEXT: mov w28, #32 // =0x20
-; CHECK-NEXT: stur x28, [x27, #-16]
-; CHECK-NEXT: sub x27, x29, #72
-; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: mov w30, #32 // =0x20
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #64
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
; CHECK-NEXT: stur xzr, [x30, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-160] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-136] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
; CHECK-NEXT: subs x28, x28, #1
; CHECK-NEXT: lsr x28, x28, #5
; CHECK-NEXT: add x28, x28, #1
; CHECK-NEXT: stur x28, [x30, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-128] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x28, x29, #120
-; CHECK-NEXT: stur x27, [x28, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x28, [x30, #-16]
-; CHECK-NEXT: sub x27, x29, #64
-; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x30, #-16]
-; CHECK-NEXT: mul x27, x28, x27
-; CHECK-NEXT: sub x28, x29, #120
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: add x27, x28, x27, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-208] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-136] // 8-byte Folded Reload
-; CHECK-NEXT: mov w28, #32 // =0x20
-; CHECK-NEXT: stur x28, [x27, #-16]
-; CHECK-NEXT: sub x27, x29, #48
-; CHECK-NEXT: ldur x28, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-152] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: subs x27, x27, #1
-; CHECK-NEXT: lsr x27, x27, #5
-; CHECK-NEXT: add x27, x27, #1
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-144] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #168
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
; CHECK-NEXT: ldur x30, [x30, #-16]
-; CHECK-NEXT: mul x28, x28, x30
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-184] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: sub x27, x29, #8
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: sub x27, x29, #16
-; CHECK-NEXT: ldur x28, [x27, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x29, #-240] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #168
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x30, [x29, #-192] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: sub x8, x29, #32
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #40
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
; CHECK-NEXT: stur xzr, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-160] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #24
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-128] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: subs x27, x27, x28
-; CHECK-NEXT: ldur x28, [x29, #-96] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-112] // 8-byte Folded Reload
+; CHECK-NEXT: sub x8, x29, #40
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: subs x28, x28, x30
+; CHECK-NEXT: ldur x30, [x29, #-96] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-112] // 8-byte Folded Reload
; CHECK-NEXT: mov w30, #32 // =0x20
-; CHECK-NEXT: stur x30, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x28, #-16]
-; CHECK-NEXT: ldur x28, [x29, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: sub x27, x29, #24
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-152] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x28, x29, #8
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-96] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: subs x27, x27, x28
-; CHECK-NEXT: ldur x28, [x29, #-104] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-120] // 8-byte Folded Reload
-; CHECK-NEXT: mov w30, #32 // =0x20
-; CHECK-NEXT: stur x30, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x28, #-16]
-; CHECK-NEXT: sub x28, x29, #24
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: sub x27, x29, #16
-; CHECK-NEXT: ldur x27, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: sub x8, x29, #24
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: sub x8, x29, #8
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-136] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: whilelt pn8.s, x27, x28, vlx2
-; CHECK-NEXT: ldur x27, [x29, #-192] // 8-byte Folded Reload
-; CHECK-NEXT: str pn8, [x27]
-; CHECK-NEXT: sub x27, x29, #8
-; CHECK-NEXT: ldur x30, [x27, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x30, #-16]
-; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: sub x8, x29, #32
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x30, #-16]
+; CHECK-NEXT: subs x28, x28, x30
+; CHECK-NEXT: ldur x30, [x29, #-104] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-120] // 8-byte Folded Reload
+; CHECK-NEXT: mov w30, #32 // =0x20
+; CHECK-NEXT: stur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-104] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: whilelt pn8.s, x27, x28, vlx2
-; CHECK-NEXT: ldur x27, [x29, #-200] // 8-byte Folded Reload
-; CHECK-NEXT: str pn8, [x27]
-; CHECK-NEXT: ldur x27, [x29, #-240] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-168] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x28, x29, #16
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: sub x8, x29, #8
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: sub x8, x29, #40
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x30, #-16]
-; CHECK-NEXT: ldur x30, [x29, #-176] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x29, #-128] // 8-byte Folded Reload
; CHECK-NEXT: ldur x30, [x30, #-16]
-; CHECK-NEXT: mul x28, x28, x30
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-240] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: zero {za}
-; CHECK-NEXT: ldur x27, [x29, #-216] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-128] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x28, x29, #16
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: whilelt pn8.s, x28, x30, vlx2
+; CHECK-NEXT: ldur x28, [x29, #-200] // 8-byte Folded Reload
+; CHECK-NEXT: str pn8, [x28]
+; CHECK-NEXT: sub x8, x29, #32
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: ldur x30, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x29, #-136] // 8-byte Folded Reload
; CHECK-NEXT: ldur x30, [x30, #-16]
-; CHECK-NEXT: mul x28, x28, x30
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-216] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-232] // 8-byte Folded Reload
-; CHECK-NEXT: stur xzr, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-144] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x28, x29, #8
-; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: whilelt pn8.s, x28, x30, vlx2
+; CHECK-NEXT: ldur x28, [x29, #-208] // 8-byte Folded Reload
+; CHECK-NEXT: str pn8, [x28]
+; CHECK-NEXT: ldur x28, [x29, #-248] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-168] // 8-byte Folded Reload
; CHECK-NEXT: ldur x28, [x28, #-16]
-; CHECK-NEXT: ldur x30, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: sub x8, x29, #40
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
; CHECK-NEXT: ldur x30, [x30, #-16]
-; CHECK-NEXT: mul x28, x28, x30
-; CHECK-NEXT: add x27, x27, x28, lsl #2
-; CHECK-NEXT: ldur x28, [x29, #-232] // 8-byte Folded Reload
-; CHECK-NEXT: stur x27, [x28, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-80] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x30, x29, #88
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x27, [x29, #-192] // 8-byte Folded Reload
-; CHECK-NEXT: ldr p0, [x27]
-; CHECK-NEXT: ldur x27, [x29, #-216] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x27, [x27, #-16]
-; CHECK-NEXT: sub x30, x29, #96
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x27, [x29, #-88] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x30, [x27, #-16]
-; CHECK-NEXT: lsr x27, x30, #2
-; CHECK-NEXT: sub x30, x29, #104
-; CHECK-NEXT: stur x27, [x30, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x27, [x29, #-200] // 8-byte Folded Reload
-; CHECK-NEXT: ldr p1, [x27]
-; CHECK-NEXT: ldur x27, [x28, #-16]
-; CHECK-NEXT: sub x28, x29, #112
-; CHECK-NEXT: stur x27, [x28, #-256] // 8-byte Folded Spill
-; CHECK-NEXT: ldur x27, [x29, #-136] // 8-byte Folded Reload
-; CHECK-NEXT: ldur x28, [x27, #-16]
+; CHECK-NEXT: add x28, x28, x30, lsl #2
+; CHECK-NEXT: sub x8, x29, #176
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: sub x8, x29, #32
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-176] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #176
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x30, [x29, #-248] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: ldur x28, [x29, #-224] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-144] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #200
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: sub x8, x29, #40
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #192
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #192
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #200
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x30, [x29, #-224] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur xzr, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-160] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #184
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: sub x8, x29, #32
+; CHECK-NEXT: ldur x28, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x30, [x28, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: mul x28, x30, x28
+; CHECK-NEXT: sub x8, x29, #184
+; CHECK-NEXT: ldur x30, [x8, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: ldur x30, [x29, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x28, [x29, #-80] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #96
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-200] // 8-byte Folded Reload
+; CHECK-NEXT: ldr p0, [x28]
+; CHECK-NEXT: ldur x28, [x29, #-224] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: sub x8, x29, #104
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-88] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
; CHECK-NEXT: lsr x28, x28, #2
-; CHECK-NEXT: ldur x27, [x29, #-40] // 8-byte Folded Reload
-; CHECK-NEXT: sub x30, x29, #88
-; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x27, #-16]
+; CHECK-NEXT: sub x8, x29, #112
+; CHECK-NEXT: stur x28, [x8, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x8, [x29, #-8]
+; CHECK-NEXT: ldur x28, [x29, #-208] // 8-byte Folded Reload
+; CHECK-NEXT: ldr p1, [x28]
+; CHECK-NEXT: ldur x28, [x30, #-16]
+; CHECK-NEXT: sub x30, x29, #120
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x28, [x29, #-152] // 8-byte Folded Reload
+; CHECK-NEXT: ldur x28, [x28, #-16]
+; CHECK-NEXT: lsr x28, x28, #2
+; CHECK-NEXT: sub x30, x29, #128
+; CHECK-NEXT: stur x28, [x30, #-256] // 8-byte Folded Spill
+; CHECK-NEXT: ldur x30, [x29, #-40] // 8-byte Folded Reload
+; CHECK-NEXT: sub x28, x29, #96
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
; CHECK-NEXT: str p0, [x6]
-; CHECK-NEXT: ldur x27, [x29, #-48] // 8-byte Folded Reload
-; CHECK-NEXT: sub x30, x29, #96
-; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-56] // 8-byte Folded Reload
-; CHECK-NEXT: sub x30, x29, #104
-; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x27, #-16]
-; CHECK-NEXT: str p1, [x20]
-; CHECK-NEXT: ldur x27, [x29, #-64] // 8-byte Folded Reload
-; CHECK-NEXT: sub x30, x29, #112
-; CHECK-NEXT: ldur x30, [x30, #-256] // 8-byte Folded Reload
-; CHECK-NEXT: stur x30, [x27, #-16]
-; CHECK-NEXT: ldur x27, [x29, #-72] // 8-byte Folded Reload
-; CHECK-NEXT: stur x28, [x27, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-48] // 8-byte Folded Reload
+; CHECK-NEXT: sub x28, x29, #104
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-56] // 8-byte Folded Reload
+; CHECK-NEXT: sub x28, x29, #112
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: str p1, [x21]
+; CHECK-NEXT: ldur x30, [x29, #-64] // 8-byte Folded Reload
+; CHECK-NEXT: sub x28, x29, #120
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
+; CHECK-NEXT: ldur x30, [x29, #-72] // 8-byte Folded Reload
+; CHECK-NEXT: sub x28, x29, #128
+; CHECK-NEXT: ldur x28, [x28, #-256] // 8-byte Folded Reload
+; CHECK-NEXT: stur x28, [x30, #-16]
; CHECK-NEXT: ldr p0, [x6]
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: pext { p0.s, p1.s }, pn8[0]
; CHECK-NEXT: ptrue p2.s
; CHECK-NEXT: and p3.b, p0/z, p0.b, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p2.b
-; CHECK-NEXT: mov x27, x23
-; CHECK-NEXT: incd x27
-; CHECK-NEXT: str p0, [x27]
-; CHECK-NEXT: str p3, [x23]
-; CHECK-NEXT: ldr p0, [x20]
+; CHECK-NEXT: mov x28, x24
+; CHECK-NEXT: incd x28
+; CHECK-NEXT: str p0, [x28]
+; CHECK-NEXT: str p3, [x24]
+; CHECK-NEXT: ldr p0, [x21]
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: pext { p0.s, p1.s }, pn8[0]
; CHECK-NEXT: and p3.b, p0/z, p0.b, p2.b
; CHECK-NEXT: and p0.b, p1/z, p1.b, p2.b
-; CHECK-NEXT: mov x27, x24
-; CHECK-NEXT: incd x27
-; CHECK-NEXT: str p0, [x27]
-; CHECK-NEXT: str p3, [x24]
+; CHECK-NEXT: mov x28, x25
+; CHECK-NEXT: incd x28
+; CHECK-NEXT: str p0, [x28]
+; CHECK-NEXT: str p3, [x25]
; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_3: // %bb178
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr p0, [x6]
-; CHECK-NEXT: ldr x27, [x7]
+; CHECK-NEXT: ldr x28, [x7]
; CHECK-NEXT: mov p8.b, p0.b
-; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27]
+; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x28]
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z24.s }, p0, [x25, #1, mul vl]
-; CHECK-NEXT: st1w { z16.s }, p0, [x25]
-; CHECK-NEXT: ldr x27, [x19]
-; CHECK-NEXT: ldr x28, [x7]
-; CHECK-NEXT: add x27, x28, x27, lsl #2
-; CHECK-NEXT: str x27, [x7]
-; CHECK-NEXT: ldr p1, [x20]
-; CHECK-NEXT: ldr x27, [x21]
-; CHECK-NEXT: mov p8.b, p1.b
-; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x27]
; CHECK-NEXT: st1w { z24.s }, p0, [x26, #1, mul vl]
; CHECK-NEXT: st1w { z16.s }, p0, [x26]
-; CHECK-NEXT: ldr x27, [x22]
-; CHECK-NEXT: ldr x28, [x21]
-; CHECK-NEXT: add x27, x28, x27, lsl #2
-; CHECK-NEXT: str x27, [x21]
-; CHECK-NEXT: mov x27, x23
-; CHECK-NEXT: incd x27
-; CHECK-NEXT: ldr p1, [x23]
+; CHECK-NEXT: ldr x28, [x20]
+; CHECK-NEXT: ldr x30, [x7]
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: str x28, [x7]
+; CHECK-NEXT: ldr p1, [x21]
+; CHECK-NEXT: ldr x28, [x22]
+; CHECK-NEXT: mov p8.b, p1.b
+; CHECK-NEXT: ld1w { z16.s, z24.s }, pn8/z, [x28]
+; CHECK-NEXT: st1w { z24.s }, p0, [x27, #1, mul vl]
+; CHECK-NEXT: st1w { z16.s }, p0, [x27]
+; CHECK-NEXT: ldr x28, [x23]
+; CHECK-NEXT: ldr x30, [x22]
+; CHECK-NEXT: add x28, x30, x28, lsl #2
+; CHECK-NEXT: str x28, [x22]
; CHECK-NEXT: mov x28, x24
; CHECK-NEXT: incd x28
-; CHECK-NEXT: ldr p2, [x24]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26]
+; CHECK-NEXT: ldr p1, [x24]
+; CHECK-NEXT: mov x30, x25
+; CHECK-NEXT: incd x30
+; CHECK-NEXT: ldr p2, [x25]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x26]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x27]
; CHECK-NEXT: str p1, [x1]
; CHECK-NEXT: str p2, [x2]
; CHECK-NEXT: st1w { z0.s }, p0, [x3]
@@ -626,10 +682,10 @@ define void @quux() #1 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x3]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x4]
; CHECK-NEXT: fmopa za0.s, p1/m, p2/m, z0.s, z1.s
-; CHECK-NEXT: ldr p1, [x27]
-; CHECK-NEXT: ldr p2, [x24]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25, #1, mul vl]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26]
+; CHECK-NEXT: ldr p1, [x28]
+; CHECK-NEXT: ldr p2, [x25]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x26, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x27]
; CHECK-NEXT: str p1, [x16]
; CHECK-NEXT: str p2, [x17]
; CHECK-NEXT: st1w { z0.s }, p0, [x18]
@@ -639,10 +695,10 @@ define void @quux() #1 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x18]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT: fmopa za1.s, p1/m, p2/m, z0.s, z1.s
-; CHECK-NEXT: ldr p1, [x23]
-; CHECK-NEXT: ldr p2, [x28]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26, #1, mul vl]
+; CHECK-NEXT: ldr p1, [x24]
+; CHECK-NEXT: ldr p2, [x30]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x26]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x27, #1, mul vl]
; CHECK-NEXT: str p1, [x12]
; CHECK-NEXT: str p2, [x13]
; CHECK-NEXT: st1w { z0.s }, p0, [x14]
@@ -652,10 +708,10 @@ define void @quux() #1 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x14]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x15]
; CHECK-NEXT: fmopa za2.s, p1/m, p2/m, z0.s, z1.s
-; CHECK-NEXT: ldr p1, [x27]
-; CHECK-NEXT: ldr p2, [x28]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x25, #1, mul vl]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x26, #1, mul vl]
+; CHECK-NEXT: ldr p1, [x28]
+; CHECK-NEXT: ldr p2, [x30]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x26, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x27, #1, mul vl]
; CHECK-NEXT: str p1, [x8]
; CHECK-NEXT: str p2, [x9]
; CHECK-NEXT: st1w { z0.s }, p0, [x10]
@@ -665,9 +721,9 @@ define void @quux() #1 {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x10]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x11]
; CHECK-NEXT: fmopa za3.s, p1/m, p2/m, z0.s, z1.s
-; CHECK-NEXT: ldr x27, [x5]
-; CHECK-NEXT: subs x27, x27, #1
-; CHECK-NEXT: str x27, [x5]
+; CHECK-NEXT: ldr x28, [x5]
+; CHECK-NEXT: subs x28, x28, #1
+; CHECK-NEXT: str x28, [x5]
; CHECK-NEXT: b .LBB0_3
bb:
%alloca = alloca <vscale x 16 x i1>, align 2
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f19..92baf0d223a4e 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -8,7 +8,8 @@ declare float @llvm.cos.f32(float)
define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_1_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -31,7 +32,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
@@ -41,20 +43,21 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_2_callees:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x19, #1
+; CHECK-NEXT: rdsvl x20, #1
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: msub x8, x19, x19, x8
+; CHECK-NEXT: msub x8, x20, x20, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x20, x29, #16
+; CHECK-NEXT: sub x21, x29, #16
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -64,8 +67,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -76,8 +79,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
call void @private_za_callee()
@@ -88,7 +92,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_expanded_intrinsic:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -111,7 +116,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call float @llvm.cos.f32(float %a)
ret float %res
@@ -127,7 +133,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -140,13 +146,13 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sturh w8, [x29, #-72]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbz w19, #0, .LBB3_2
+; CHECK-NEXT: and x20, x0, #0x1
+; CHECK-NEXT: tbz w20, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: tbz w19, #0, .LBB3_4
+; CHECK-NEXT: tbz w20, #0, .LBB3_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB3_4:
@@ -159,8 +165,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index cd7460b177c4b..095e84cda1085 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -7,7 +7,8 @@ declare void @private_za_callee()
define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -30,7 +31,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @private_za_callee()
ret void
@@ -40,7 +42,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-LABEL: f128_call_za:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -63,7 +66,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = fadd fp128 %a, %b
ret fp128 %res
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 7f40b5e7e1344..884096743e034 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -34,7 +34,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
; CHECK-LABEL: za_zt0_shared_caller_no_state_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: rdsvl x8, #1
@@ -42,16 +42,16 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: sub x19, x29, #80
+; CHECK-NEXT: sub x20, x29, #80
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x9, [x29, #-16]
; CHECK-NEXT: sturh w8, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: str zt0, [x20]
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart za
-; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: ldr zt0, [x20]
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
; CHECK-NEXT: cbnz x8, .LBB1_2
@@ -60,7 +60,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee();
@@ -88,22 +88,22 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64
; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x19, x29, #80
+; CHECK-NEXT: sub x20, x29, #80
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: str zt0, [x20]
; CHECK-NEXT: bl callee
-; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: ldr zt0, [x20]
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za";
@@ -114,7 +114,8 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64
define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind {
; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -126,7 +127,8 @@ define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aar
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: bl callee
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
@@ -192,7 +194,8 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind {
define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
; CHECK-LABEL: new_za_zt0_caller:
; CHECK: // %bb.0: // %prelude
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: rdsvl x8, #1
@@ -217,7 +220,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstop za
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
@@ -227,7 +231,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind {
; CHECK-LABEL: new_za_shared_zt0_caller:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -240,7 +245,8 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi
; CHECK-NEXT: zero {za}
; CHECK-NEXT: bl callee
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
@@ -250,7 +256,8 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi
define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind {
; CHECK-LABEL: shared_za_new_zt0:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -263,7 +270,8 @@ define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind {
; CHECK-NEXT: zero { zt0 }
; CHECK-NEXT: bl callee
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;