[llvm] [AArch64]Fix invalid use of ld1/st1 in stack alloc (PR #105518)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 5 05:31:15 PDT 2024
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/105518
>From 2a92ffdf3a6f9a1e96e456db2fbfc0cc11c5df19 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Tue, 3 Sep 2024 14:57:29 +0000
Subject: [PATCH 1/4] [AArch64]Fix invalid use of ld1/st1 in stack alloc
This patch fixes invalid usage of the scalar+immediate variant of the ld1/st1 instructions during stack allocation, introduced by c4bac7f. That commit used ld1/st1 even when the stack offset was outside the immediate range of the instruction, producing invalid assembly.
---
.../Target/AArch64/AArch64FrameLowering.cpp | 19 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 68 +-
.../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 1456 +++++++++++------
.../AArch64/sve-callee-save-restore-pairs.ll | 84 +-
4 files changed, 1020 insertions(+), 607 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index e0f8853b715354..aba950dba8a5a5 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3019,6 +3019,7 @@ static void computeCalleeSaveRegisterPairs(
ByteOffset += StackFillDir * StackHazardSize;
LastReg = RPI.Reg1;
+ int Scale = RPI.getScale();
// Add the next reg to the pair if it is in the same register class.
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
Register NextReg = CSI[i + RegInc].getReg();
@@ -3044,9 +3045,14 @@ static void computeCalleeSaveRegisterPairs(
case RegPairInfo::PPR:
break;
case RegPairInfo::ZPR:
- if (AFI->getPredicateRegForFillSpill() != 0)
- if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+ if (AFI->getPredicateRegForFillSpill() != 0 &&
+ ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
+ // Calculate offset of register pair to see if pair instruction can be
+ // used.
+ int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
+ if ((-17 < Offset && Offset < 15) && (Offset % 2 == 0))
RPI.Reg2 = NextReg;
+ }
break;
case RegPairInfo::VG:
break;
@@ -3086,7 +3092,6 @@ static void computeCalleeSaveRegisterPairs(
if (NeedsWinCFI &&
RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
- int Scale = RPI.getScale();
int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
assert(OffsetPre % Scale == 0);
@@ -3355,8 +3360,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #offset*2*scale],
+ // where scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3522,8 +3527,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #offset*2*scale]
+ // where scale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6264ce0cf4ae6d..405a356bf9fb25 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -332,16 +332,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -349,7 +349,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -427,16 +429,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: ptrue pn8.b
; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -444,7 +446,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: addvl sp, sp, #1
; FP-CHECK-NEXT: ptrue pn8.b
+; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 29d3d68fc4c3de..c63899cf7d2575 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,31 +55,45 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -89,14 +103,20 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -109,15 +129,21 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -184,31 +210,45 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -218,14 +258,20 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -238,15 +284,21 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -314,31 +366,45 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -348,14 +414,20 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -368,15 +440,21 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -443,31 +521,45 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -477,14 +569,20 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -497,15 +595,21 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -573,31 +677,45 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -607,14 +725,20 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -627,15 +751,21 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -702,31 +832,45 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -736,14 +880,20 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -756,15 +906,21 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -832,31 +988,45 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -866,14 +1036,20 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -886,15 +1062,21 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -961,31 +1143,45 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -995,14 +1191,20 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1015,15 +1217,21 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1093,32 +1301,46 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1130,13 +1352,18 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1154,14 +1381,19 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1234,32 +1466,46 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1271,13 +1517,18 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1295,14 +1546,19 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1376,32 +1632,46 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1413,13 +1683,18 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1437,14 +1712,19 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1518,32 +1798,46 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1555,13 +1849,18 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1579,14 +1878,19 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1660,32 +1964,46 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1697,13 +2015,18 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1721,14 +2044,19 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1802,32 +2130,46 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1839,13 +2181,18 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1863,14 +2210,19 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1944,32 +2296,46 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1981,13 +2347,18 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2005,14 +2376,19 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2086,32 +2462,46 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2123,13 +2513,18 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2147,14 +2542,19 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
index 470c0dd45782c9..c9d216935edbf3 100644
--- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -91,16 +91,16 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: ptrue pn8.b
; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -108,7 +108,8 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
; PAIR-NEXT: .cfi_offset w30, -8
; PAIR-NEXT: .cfi_offset w29, -16
@@ -122,15 +123,16 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; PAIR-NEXT: bl my_func
; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -233,16 +235,16 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: ptrue pn8.b
; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -250,7 +252,8 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
; PAIR-NEXT: .cfi_offset w30, -8
; PAIR-NEXT: .cfi_offset w29, -16
@@ -264,15 +267,16 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; PAIR-NEXT: bl my_func
; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -328,7 +332,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() {
; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; PAIR-NEXT: .cfi_offset w29, -16
; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
@@ -338,7 +342,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() {
; PAIR-NEXT: //NO_APP
; PAIR-NEXT: ptrue pn8.b
; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -385,7 +389,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() {
; PAIR-NEXT: ptrue pn9.b
; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #2, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; PAIR-NEXT: .cfi_offset w29, -16
; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
@@ -396,7 +400,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() {
; PAIR-NEXT: ptrue pn9.b
; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #2, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: addvl sp, sp, #4
; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -431,17 +435,17 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_regs() {
; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; PAIR-NEXT: addvl sp, sp, #-3
; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: ptrue pn8.b
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; PAIR-NEXT: .cfi_offset w29, -16
; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; PAIR-NEXT: //APP
; PAIR-NEXT: //NO_APP
-; PAIR-NEXT: ptrue pn8.b
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; PAIR-NEXT: addvl sp, sp, #3
; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; PAIR-NEXT: ret
>From 5c8aa9ff084912f4452b27c22397a6bd68a23a14 Mon Sep 17 00:00:00 2001
From: Lukacma <Marian.Lukac at arm.com>
Date: Wed, 4 Sep 2024 14:10:52 +0100
Subject: [PATCH 2/4] Update llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
Co-authored-by: Sander de Smalen <sander.desmalen at arm.com>
---
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index aba950dba8a5a5..beb47ae05c3712 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3050,7 +3050,7 @@ static void computeCalleeSaveRegisterPairs(
// Calculate offset of register pair to see if pair instruction can be
// used.
int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
- if ((-17 < Offset && Offset < 15) && (Offset % 2 == 0))
+ if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
RPI.Reg2 = NextReg;
}
break;
>From cccdb5137c39e3730af139d88cdfe0bde5658ef1 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 5 Sep 2024 11:20:19 +0000
Subject: [PATCH 3/4] Fix comments and broken testcase
---
.../Target/AArch64/AArch64FrameLowering.cpp | 12 +-
.../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 1456 +++++++++++------
2 files changed, 934 insertions(+), 534 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index beb47ae05c3712..ce18f3aee302f8 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3360,8 +3360,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset / 2) // [sp, #offset*2*scale],
- // where scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #imm*2*scale],
+ // where 2*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3382,8 +3382,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
+ .addImm(RPI.Offset) // [sp, #offset*vscale],
+ // where factor*vscale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3527,8 +3527,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset / 2) // [sp, #offset*2*scale]
- // where scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale]
+ // where 2*vscale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 3d3748e1011228..05241f788d3ead 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -8,31 +8,45 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -42,14 +56,20 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -62,15 +82,21 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -90,31 +116,45 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -124,14 +164,20 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -144,15 +190,21 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -173,31 +225,45 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -207,14 +273,20 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -227,15 +299,21 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -255,31 +333,45 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -289,14 +381,20 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -309,15 +407,21 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -338,31 +442,45 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -372,14 +490,20 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -392,15 +516,21 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -420,31 +550,45 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -454,14 +598,20 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -474,15 +624,21 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -503,31 +659,45 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -537,14 +707,20 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -557,15 +733,21 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -585,31 +767,45 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -619,14 +815,20 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -639,15 +841,21 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -668,32 +876,46 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -705,13 +927,18 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -729,14 +956,19 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -760,32 +992,46 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -797,13 +1043,18 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -821,14 +1072,19 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -853,32 +1109,46 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -890,13 +1160,18 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -914,14 +1189,19 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -945,32 +1225,46 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -982,13 +1276,18 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1006,14 +1305,19 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1038,32 +1342,46 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1075,13 +1393,18 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1099,14 +1422,19 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1130,32 +1458,46 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1167,13 +1509,18 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1191,14 +1538,19 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1223,32 +1575,46 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1260,13 +1626,18 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1284,14 +1655,19 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1315,32 +1691,46 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1352,13 +1742,18 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1376,14 +1771,19 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
>From b2682f26085441e8ae03768e4874d8dc0cd2669c Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 5 Sep 2024 12:30:08 +0000
Subject: [PATCH 4/4] Fix comments
---
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a3962e11419842..7e041b086599b9 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3361,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset / 2) // [sp, #imm*2*scale],
- // where 2*scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale],
+ // where 2*vscale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3546,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
}
MIB.addReg(Reg1, getDefRegState(true));
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
+ .addImm(RPI.Offset) // [sp, #offset*vscale]
+ // where factor*vscale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
More information about the llvm-commits
mailing list