[llvm] [AArch64]Fix invalid use of ld1/st1 in stack alloc (PR #105518)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 21 06:00:43 PDT 2024
https://github.com/Lukacma created https://github.com/llvm/llvm-project/pull/105518
This patch fixes invalid usage of the scalar+immediate variant of the ld1/st1 instructions during stack allocation, caused by [c4bac7f](https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9). That commit used ld1/st1 even when the stack offset was outside the immediate range of these instructions, producing invalid assembly.
>From d7fbd115be1afc57c46973fd5ebd974151cac307 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 21 Aug 2024 12:52:02 +0000
Subject: [PATCH] [AArch64]Fix invalid use of ld1/st1 in stack alloc
---
.../Target/AArch64/AArch64FrameLowering.cpp | 54 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 74 +-
.../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 800 +++++++++++-------
.../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 800 +++++++++++-------
.../AArch64/sve-callee-save-restore-pairs.ll | 64 +-
5 files changed, 1141 insertions(+), 651 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index e0f8853b715354..e2648ed1da87c0 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3229,7 +3229,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ if (RPI.isPaired())
+ // When stack offset out of range for st1b scalar + imm variant,
+ // use scalar + scalar variant.
+ StrOpc = RPI.Offset >= -8 && RPI.Offset <= 7 ? AArch64::ST1B_2Z_IMM
+ : AArch64::ST1B_2Z;
+ else
+ StrOpc = AArch64::STR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3354,10 +3360,21 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addReg(AArch64::SP);
+ if (RPI.Offset >= -8 && RPI.Offset <= 7)
+ MIB.addImm(RPI.Offset);
+ else{
+ // When stack offset out of range for st1b scalar + imm variant,
+ // store offset in register for use in scalar + scalar variant.
+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(ScratchReg != AArch64::NoRegister);
+ BuildMI(MBB, std::prev(MI), DL, TII.get(AArch64::MOVZXi), ScratchReg)
+ .addImm(RPI.Offset)
+ .addImm(/*2*scale*/ 5)
+ .setMIFlags(MachineInstr::FrameSetup);
+ MIB.addReg(ScratchReg, RegState::Kill);
+ }
+ MIB.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOStore, Size, Alignment));
@@ -3470,7 +3487,13 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
+ if (RPI.isPaired())
+      // When stack offset out of range for ld1b scalar + imm variant,
+      // use scalar + scalar variant.
+ LdrOpc = RPI.Offset >= -8 && RPI.Offset <= 7 ? AArch64::LD1B_2Z_IMM
+ : AArch64::LD1B_2Z;
+ else
+ LdrOpc = AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3521,10 +3544,21 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(PnReg);
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addReg(AArch64::SP);
+ if (RPI.Offset >= -8 && RPI.Offset <= 7)
+ MIB.addImm(RPI.Offset);
+ else{
+ // When stack offset out of range for ld1b scalar + imm variant,
+ // store offset in register for use in scalar + scalar variant.
+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(ScratchReg != AArch64::NoRegister);
+ BuildMI(MBB, std::prev(MBBI), DL, TII.get(AArch64::MOVZXi), ScratchReg)
+ .addImm(RPI.Offset)
+ .addImm(/*2*scale*/ 5)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ MIB.addReg(ScratchReg, RegState::Kill);
+ }
+ MIB.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOLoad, Size, Alignment));
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6264ce0cf4ae6d..5972ab8469b4c0 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -335,13 +335,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -349,7 +344,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -360,7 +355,17 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT: mov x9, #256 // =0x100
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #320 // =0x140
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #384 // =0x180
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #448 // =0x1c0
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #512 // =0x200
+; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .cfi_offset vg, -16
@@ -369,18 +374,23 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: bl scalable_callee
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .cfi_restore vg
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov x9, #256 // =0x100
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #320 // =0x140
; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #384 // =0x180
; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #448 // =0x1c0
; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #512 // =0x200
+; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -430,13 +440,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -444,7 +449,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -454,7 +459,17 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
; FP-CHECK-NEXT: addvl sp, sp, #-1
+; FP-CHECK-NEXT: mov x9, #256 // =0x100
; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #320 // =0x140
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #384 // =0x180
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #448 // =0x1c0
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #512 // =0x200
+; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; FP-CHECK-NEXT: //APP
; FP-CHECK-NEXT: //NO_APP
; FP-CHECK-NEXT: .cfi_offset vg, -32
@@ -463,17 +478,22 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: bl scalable_callee
; FP-CHECK-NEXT: smstart sm
; FP-CHECK-NEXT: .cfi_restore vg
-; FP-CHECK-NEXT: addvl sp, sp, #1
; FP-CHECK-NEXT: ptrue pn8.b
-; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #256 // =0x100
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #320 // =0x140
; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #384 // =0x180
; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #448 // =0x1c0
; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #512 // =0x200
+; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: addvl sp, sp, #1
+; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 29d3d68fc4c3de..0152941addad7d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -56,29 +56,37 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -94,13 +102,18 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -108,16 +121,17 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -185,29 +199,37 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -223,13 +245,18 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -237,16 +264,17 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -315,29 +343,37 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -353,13 +389,18 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -367,16 +408,17 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -444,29 +486,37 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -482,13 +532,18 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -496,16 +551,17 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -574,29 +630,37 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -612,13 +676,18 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -626,16 +695,17 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -703,29 +773,37 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -741,13 +819,18 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -755,16 +838,17 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -833,29 +917,37 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -871,13 +963,18 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -885,16 +982,17 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -962,29 +1060,37 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1000,13 +1106,18 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -1014,16 +1125,17 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1094,30 +1206,38 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1134,13 +1254,17 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1152,16 +1276,16 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1235,30 +1359,38 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1275,13 +1407,17 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0, x1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1293,16 +1429,16 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1377,30 +1513,38 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1417,13 +1561,17 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1435,16 +1583,16 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1519,30 +1667,38 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1559,13 +1715,17 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1577,16 +1737,16 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1661,30 +1821,38 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1701,13 +1869,17 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1719,16 +1891,16 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1803,30 +1975,38 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1843,13 +2023,17 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1861,16 +2045,16 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1945,30 +2129,38 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1985,13 +2177,17 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -2003,16 +2199,16 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2087,30 +2283,38 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -2127,13 +2331,17 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -2145,16 +2353,16 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 3d3748e1011228..7b4170467799b9 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -9,29 +9,37 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -47,13 +55,18 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -61,16 +74,17 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -91,29 +105,37 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -129,13 +151,18 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -143,16 +170,17 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -174,29 +202,37 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -212,13 +248,18 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -226,16 +267,17 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -256,29 +298,37 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -294,13 +344,18 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -308,16 +363,17 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -339,29 +395,37 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -377,13 +441,18 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -391,16 +460,17 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -421,29 +491,37 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -459,13 +537,18 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -473,16 +556,17 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -504,29 +588,37 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -542,13 +634,18 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -556,16 +653,17 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -586,29 +684,37 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -624,13 +730,18 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -638,16 +749,17 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -669,30 +781,38 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -709,13 +829,17 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -727,16 +851,16 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -761,30 +885,38 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -801,13 +933,17 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0, x1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -819,16 +955,16 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -854,30 +990,38 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -894,13 +1038,17 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -912,16 +1060,16 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -946,30 +1094,38 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -986,13 +1142,17 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1004,16 +1164,16 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1039,30 +1199,38 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1079,13 +1247,17 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1097,16 +1269,16 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1131,30 +1303,38 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1171,13 +1351,17 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1189,16 +1373,16 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1224,30 +1408,38 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1264,13 +1456,17 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1282,16 +1478,16 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1316,30 +1512,38 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z2.d, z8.d
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
@@ -1356,13 +1560,17 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: addvl sp, sp, #-4
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl]
@@ -1374,16 +1582,16 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #384 // =0x180
; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
index 470c0dd45782c9..cb3b46130d19b5 100644
--- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -94,13 +94,8 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -108,7 +103,7 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
; PAIR-NEXT: .cfi_offset w30, -8
; PAIR-NEXT: .cfi_offset w29, -16
@@ -120,18 +115,33 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: mov x9, #256 // =0x100
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #320 // =0x140
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #384 // =0x180
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #448 // =0x1c0
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #512 // =0x200
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; PAIR-NEXT: bl my_func
; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: mov x9, #256 // =0x100
; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #320 // =0x140
; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #384 // =0x180
; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #448 // =0x1c0
; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #512 // =0x200
; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
@@ -236,13 +246,8 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -250,7 +255,7 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
; PAIR-NEXT: .cfi_offset w30, -8
; PAIR-NEXT: .cfi_offset w29, -16
@@ -262,18 +267,33 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: mov x9, #256 // =0x100
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #320 // =0x140
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #384 // =0x180
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #448 // =0x1c0
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; PAIR-NEXT: mov x9, #512 // =0x200
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; PAIR-NEXT: bl my_func
; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: mov x9, #256 // =0x100
; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #320 // =0x140
; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #384 // =0x180
; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #448 // =0x1c0
; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; PAIR-NEXT: mov x9, #512 // =0x200
; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
More information about the llvm-commits
mailing list