[llvm] [LLVM][AArch64] Use load/store with consecutive registers in SME2 or SVE2.1 for spill/fill (PR #77665)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 10 10:42:08 PST 2024
https://github.com/CarolineConcatto created https://github.com/llvm/llvm-project/pull/77665
…VE2.1 for spill/fill
When possible, frame lowering now spills and fills callee-saved Z registers using the consecutive-register-pair load/store instructions available in SME2 or SVE2.1.
From 75c621bd18667664a4263ab4e81d169be0563614 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Wed, 10 Jan 2024 17:23:04 +0000
Subject: [PATCH] [LLVM][AArch64]Use load/store with consecutive registers in
SME2 or SVE2.1 for spill/fill
When possible the spill/fill register in Frame Lowering uses the ld/st
consecutive pairs available in sme or sve2.1.
---
.../Target/AArch64/AArch64FrameLowering.cpp | 100 +-
.../AArch64/AArch64MachineFunctionInfo.h | 2 +
.../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 1456 ++++++-----------
.../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 1456 ++++++-----------
.../AArch64/sve2p1-intrinsics-ldst-pair.ll | 295 ++++
5 files changed, 1431 insertions(+), 1878 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index caab59201a8d69..7a332bfd18c21b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1480,6 +1480,11 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
default:
return false;
+ case AArch64::PTRUE_C_B:
+ case AArch64::LD1B_2Z_IMM:
+ case AArch64::ST1B_2Z_IMM:
+ return I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
+ I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2();
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
@@ -2753,6 +2758,16 @@ struct RegPairInfo {
} // end anonymous namespace
+unsigned findFreePredicateAsCounterReg(MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MCRegister PReg :
+ {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
+ AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+ if (!MRI.isReserved(PReg))
+ return PReg;
+ }
+ llvm_unreachable("cannot find a free predicate");
+}
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2763,6 +2778,7 @@ static void computeCalleeSaveRegisterPairs(
bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
+ const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -2831,7 +2847,11 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg2 = NextReg;
break;
case RegPairInfo::PPR:
+ break;
case RegPairInfo::ZPR:
+ if (Subtarget.hasSVE2p1() || Subtarget.hasSME2())
+ if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+ RPI.Reg2 = NextReg;
break;
}
}
@@ -2876,7 +2896,7 @@ static void computeCalleeSaveRegisterPairs(
assert(OffsetPre % Scale == 0);
if (RPI.isScalable())
- ScalableByteOffset += StackFillDir * Scale;
+ ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
else
ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
@@ -2887,9 +2907,6 @@ static void computeCalleeSaveRegisterPairs(
(IsWindows && RPI.Reg2 == AArch64::LR)))
ByteOffset += StackFillDir * 8;
- assert(!(RPI.isScalable() && RPI.isPaired()) &&
- "Paired spill/fill instructions don't exist for SVE vectors");
-
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -2976,6 +2993,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
return true;
}
+ bool PtrueCreated = false;
for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3010,10 +3028,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = AArch64::STR_ZXI;
- Size = 16;
- Alignment = Align(16);
- break;
+ StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ Size = 16;
+ Alignment = Align(16);
+ break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
@@ -3037,19 +3055,37 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
+
+ unsigned PnReg;
+ unsigned PairRegs;
+ if (RPI.isPaired() && RPI.isScalable()) {
+ PnReg = findFreePredicateAsCounterReg(MF);
+ PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ if (!PtrueCreated) {
+ PtrueCreated = true;
+ BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+ }
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ if (RPI.isScalable())
+ MIB.addReg(PairRegs);
+ else
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
}
- MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
- .addReg(AArch64::SP)
+ if (RPI.isPaired() && RPI.isScalable())
+ MIB.addReg(PnReg);
+ else
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+ MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale],
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
@@ -3061,8 +3097,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
- MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
+ if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+ MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+ if (RPI.isPaired())
+ MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+ }
}
return true;
@@ -3082,7 +3121,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+ auto EmitMI = [&](const RegPairInfo &RPI,
+ bool *PtrueCreated) -> MachineBasicBlock::iterator {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3114,7 +3154,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = AArch64::LDR_ZXI;
+ LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3139,15 +3179,31 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
+
+ unsigned PnReg;
+ unsigned PairRegs;
+ if (RPI.isPaired() && RPI.isScalable()) {
+ PnReg = findFreePredicateAsCounterReg(MF);
+ PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ if (!*PtrueCreated) {
+ *PtrueCreated = true;
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+ }
+
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
- MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addReg(RPI.isScalable() ? PairRegs : Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
}
- MIB.addReg(Reg1, getDefRegState(true))
- .addReg(AArch64::SP)
+ if (RPI.isPaired() && RPI.isScalable())
+ MIB.addReg(PnReg);
+ else
+ MIB.addReg(Reg1, getDefRegState(true));
+ MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale]
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
@@ -3161,9 +3217,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
};
// SVE objects are always restored in reverse order.
+ bool PtrueCreated = false;
for (const RegPairInfo &RPI : reverse(RegPairs))
if (RPI.isScalable())
- EmitMI(RPI);
+ EmitMI(RPI, &PtrueCreated);
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
@@ -3174,13 +3231,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
}
return true;
}
-
if (ReverseCSRRestoreSeq) {
MachineBasicBlock::iterator First = MBB.end();
for (const RegPairInfo &RPI : reverse(RegPairs)) {
if (RPI.isScalable())
continue;
- MachineBasicBlock::iterator It = EmitMI(RPI);
+ MachineBasicBlock::iterator It = EmitMI(RPI, &PtrueCreated);
if (First == MBB.end())
First = It;
}
@@ -3190,7 +3246,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
for (const RegPairInfo &RPI : RegPairs) {
if (RPI.isScalable())
continue;
- (void)EmitMI(RPI);
+ (void)EmitMI(RPI, &PtrueCreated);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index cd4a18bfbc23a8..b44cc8d0d0dc9b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -300,6 +300,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
int FrameIdx = Info.getFrameIdx();
if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
continue;
+ if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ continue;
int64_t Offset = MFI.getObjectOffset(FrameIdx);
int64_t ObjSize = MFI.getObjectSize(FrameIdx);
MinOffset = std::min<int64_t>(Offset, MinOffset);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc0825673..6c94546c9525aa 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -103,20 +89,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -130,21 +110,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -210,45 +184,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -258,20 +218,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -285,21 +239,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -366,45 +314,31 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -414,20 +348,14 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -441,21 +369,15 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -521,45 +443,31 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -569,20 +477,14 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -596,21 +498,15 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -677,45 +573,31 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -725,20 +607,14 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -752,21 +628,15 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -832,45 +702,31 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -880,20 +736,14 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -907,21 +757,15 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -988,45 +832,31 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -1036,20 +866,14 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1063,21 +887,15 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1143,45 +961,31 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -1191,20 +995,14 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1218,21 +1016,15 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1301,46 +1093,32 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1351,19 +1129,14 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1381,20 +1154,15 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1466,46 +1234,32 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1516,19 +1270,14 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1546,20 +1295,15 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1632,46 +1376,32 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1682,19 +1412,14 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1712,20 +1437,15 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1798,46 +1518,32 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1848,19 +1554,14 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1878,20 +1579,15 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1964,46 +1660,32 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2014,19 +1696,14 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2044,20 +1721,15 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2130,46 +1802,32 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2180,19 +1838,14 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2210,20 +1863,15 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2296,46 +1944,32 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2346,19 +1980,14 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2376,20 +2005,15 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2462,46 +2086,32 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2512,19 +2122,14 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2542,20 +2147,15 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 1fb251a4f628e9..1df619a40ef3fa 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -8,45 +8,31 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -56,20 +42,14 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -83,21 +63,15 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -116,45 +90,31 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -164,20 +124,14 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -191,21 +145,15 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -225,45 +173,31 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -273,20 +207,14 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -300,21 +228,15 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -333,45 +255,31 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -381,20 +289,14 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -408,21 +310,15 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -442,45 +338,31 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -490,20 +372,14 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -517,21 +393,15 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -550,45 +420,31 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -598,20 +454,14 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -625,21 +475,15 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -659,45 +503,31 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -707,20 +537,14 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -734,21 +558,15 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -767,45 +585,31 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -815,20 +619,14 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -842,21 +640,15 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -876,46 +668,32 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -926,19 +704,14 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -956,20 +729,15 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -992,46 +760,32 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1042,19 +796,14 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1072,20 +821,15 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1109,46 +853,32 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1159,19 +889,14 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1189,20 +914,15 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1225,46 +945,32 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1275,19 +981,14 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1305,20 +1006,15 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1342,46 +1038,32 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1392,19 +1074,14 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1422,20 +1099,15 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1458,46 +1130,32 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1508,19 +1166,14 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1538,20 +1191,15 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1575,46 +1223,32 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1625,19 +1259,14 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1655,20 +1284,15 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1691,46 +1315,32 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1741,19 +1351,14 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1771,20 +1376,15 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll
new file mode 100644
index 00000000000000..c7f38861102c00
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR
+
+
+declare void @my_func()
+
+define void @fbyte(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: fbyte:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-18
+; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; NOPAIR-NEXT: .cfi_offset w30, -8
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; NOPAIR-NEXT: bl my_func
+; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #18
+; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: fbyte:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-18
+; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; PAIR-NEXT: .cfi_offset w30, -8
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: bl my_func
+; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #18
+; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void @my_func()
+ ret void
+}
+
+define void @fhalf(<vscale x 8 x half> %v) {
+; NOPAIR-LABEL: fhalf:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-18
+; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; NOPAIR-NEXT: .cfi_offset w30, -8
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; NOPAIR-NEXT: bl my_func
+; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #18
+; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: fhalf:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-18
+; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; PAIR-NEXT: .cfi_offset w30, -8
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: bl my_func
+; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #18
+; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void @my_func()
+ ret void
+}
+
+
+
More information about the llvm-commits
mailing list