[llvm] c4bac7f - [LLVM][AArch64]Use load/store with consecutive registers in SME2 or S… (#77665)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 17 01:25:27 PDT 2024
Author: CarolineConcatto
Date: 2024-05-17T09:25:21+01:00
New Revision: c4bac7f7dcd931a5e561604e95656a24c3d1c9d9
URL: https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9
DIFF: https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9.diff
LOG: [LLVM][AArch64]Use load/store with consecutive registers in SME2 or S… (#77665)
…VE2.1 for spill/fill
When possible, frame lowering spills and fills callee-saved Z registers with
the ld/st instructions for consecutive register pairs available in SME2 or SVE2.1.
Added:
llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
Modified:
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c86c98eed24f0..239862756fc47 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1508,6 +1508,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
default:
return false;
+ case AArch64::PTRUE_C_B:
+ case AArch64::LD1B_2Z_IMM:
+ case AArch64::ST1B_2Z_IMM:
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
@@ -2781,6 +2784,16 @@ struct RegPairInfo {
} // end anonymous namespace
+unsigned findFreePredicateReg(BitVector &SavedRegs) { // Selects the PN register used as the predicate for multi-vector (paired Z-register) spill/fill.
+ for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) { // Only P8..P15 are candidates.
+ if (SavedRegs.test(PReg)) { // Despite the name, picks the first candidate that IS marked in SavedRegs. NOTE(review): presumably it is "free" because it is spilled before the PTRUE overwrites it - confirm.
+ unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0; // Converts Pn to PNn by index; assumes the P and PN register enumerators are laid out in parallel.
+ return PNReg;
+ }
+ }
+ return AArch64::NoRegister; // None of P8..P15 is marked in SavedRegs.
+}
+
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2859,7 +2872,11 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg2 = NextReg;
break;
case RegPairInfo::PPR:
+ break;
case RegPairInfo::ZPR:
+ if (AFI->getPredicateRegForFillSpill() != 0)
+ if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+ RPI.Reg2 = NextReg;
break;
}
}
@@ -2897,14 +2914,13 @@ static void computeCalleeSaveRegisterPairs(
if (NeedsWinCFI &&
RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
-
int Scale = RPI.getScale();
int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
assert(OffsetPre % Scale == 0);
if (RPI.isScalable())
- ScalableByteOffset += StackFillDir * Scale;
+ ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
else
ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
@@ -2915,9 +2931,6 @@ static void computeCalleeSaveRegisterPairs(
(IsWindows && RPI.Reg2 == AArch64::LR)))
ByteOffset += StackFillDir * 8;
- assert(!(RPI.isScalable() && RPI.isPaired()) &&
- "Paired spill/fill instructions don't exist for SVE vectors");
-
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -3004,6 +3017,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
return true;
}
+ bool PTrueCreated = false;
for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3038,10 +3052,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = AArch64::STR_ZXI;
- Size = 16;
- Alignment = Align(16);
- break;
+ StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ Size = 16;
+ Alignment = Align(16);
+ break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
@@ -3065,33 +3079,79 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- if (!MRI.isReserved(Reg1))
- MBB.addLiveIn(Reg1);
- if (RPI.isPaired()) {
+
+ if (RPI.isPaired() && RPI.isScalable()) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ unsigned PnReg = AFI->getPredicateRegForFillSpill();
+ assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
+ "Expects SVE2.1 or SME2 target and a predicate register");
+#ifdef EXPENSIVE_CHECKS
+ auto IsPPR = [](const RegPairInfo &c) { // True for entries describing predicate (PPR) callee-saves.
+ return c.Type == RegPairInfo::PPR; // Test the entry's Type, as IsZPR does; Reg1 is a register number, not a RegPairInfo kind.
+ };
+ auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+ auto IsZPR = [](const RegPairInfo &c) {
+ return c.Type == RegPairInfo::ZPR;
+ };
+ auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+ assert(!(PPRBegin < ZPRBegin) &&
+ "Expected callee save predicate to be handled first");
+#endif
+ if (!PTrueCreated) {
+ PTrueCreated = true;
+ BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
+ MIB.addReg(PnReg);
+ MIB.addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale],
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOStore, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+ } else { // The code when the pair of ZReg is not present
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
+ if (RPI.isPaired()) {
+ if (!MRI.isReserved(Reg2))
+ MBB.addLiveIn(Reg2);
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
+ MachineMemOperand::MOStore, Size, Alignment));
+ }
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
+ .addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale],
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOStore, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameSetup);
}
- MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
- .addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameSetup);
- MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
- MachineMemOperand::MOStore, Size, Alignment));
- if (NeedsWinCFI)
- InsertSEH(MIB, TII, MachineInstr::FrameSetup);
-
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
- MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
-
+ if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+ MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+ if (RPI.isPaired())
+ MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+ }
}
return true;
}
@@ -3109,7 +3169,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
DL = MBBI->getDebugLoc();
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
-
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -3130,6 +3189,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
std::reverse(ZPRBegin, ZPREnd);
+ bool PTrueCreated = false;
for (const RegPairInfo &RPI : RegPairs) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3162,7 +3222,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = AArch64::LDR_ZXI;
+ LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3187,25 +3247,58 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
- if (RPI.isPaired()) {
- MIB.addReg(Reg2, getDefRegState(true));
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (RPI.isPaired() && RPI.isScalable()) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ unsigned PnReg = AFI->getPredicateRegForFillSpill();
+ assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
+ "Expects SVE2.1 or SME2 target and a predicate register");
+#ifdef EXPENSIVE_CHECKS
+ assert(!(PPRBegin < ZPRBegin) &&
+ "Expected callee save predicate to be handled first");
+#endif
+ if (!PTrueCreated) {
+ PTrueCreated = true;
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
+ MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
+ getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
+ MIB.addReg(PnReg);
+ MIB.addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale]
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOLoad, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
+ if (RPI.isPaired()) {
+ MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
+ MachineMemOperand::MOLoad, Size, Alignment));
+ }
+ MIB.addReg(Reg1, getDefRegState(true));
+ MIB.addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale]
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOLoad, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
}
- MIB.addReg(Reg1, getDefRegState(true))
- .addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameDestroy);
- MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
- MachineMemOperand::MOLoad, Size, Alignment));
- if (NeedsWinCFI)
- InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
}
-
return true;
}
@@ -3234,6 +3327,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned ExtraCSSpill = 0;
bool HasUnpairedGPR64 = false;
+ bool HasPairZReg = false;
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
const unsigned Reg = CSRegs[i];
@@ -3287,6 +3381,28 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
!RegInfo->isReservedReg(MF, PairedReg))
ExtraCSSpill = PairedReg;
}
+ // Check if there is a pair of ZRegs, so it can select PReg for spill/fill
+ HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
+ SavedRegs.test(CSRegs[i ^ 1]));
+ }
+
+ if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // Find a suitable predicate register for the multi-vector spill/fill
+ // instructions.
+ unsigned PnReg = findFreePredicateReg(SavedRegs);
+ if (PnReg != AArch64::NoRegister)
+ AFI->setPredicateRegForFillSpill(PnReg);
+ // If no free callee-save has been found assign one.
+ if (!AFI->getPredicateRegForFillSpill() &&
+ MF.getFunction().getCallingConv() ==
+ CallingConv::AArch64_SVE_VectorCall) {
+ SavedRegs.set(AArch64::P8);
+ AFI->setPredicateRegForFillSpill(AArch64::PN8);
+ }
+
+ assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
+ "Predicate cannot be a reserved register");
}
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111..df09fc5592edf 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -212,6 +212,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// on function entry to record the initial pstate of a function.
Register PStateSMReg = MCRegister::NoRegister;
+ // Holds the PN register used to build the PTRUE instruction.
+ // The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
+ unsigned PredicateRegForFillSpill = 0;
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -220,6 +224,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
+ void setPredicateRegForFillSpill(unsigned Reg) { // Records the register that getPredicateRegForFillSpill() will return.
+ PredicateRegForFillSpill = Reg;
+ }
+ unsigned getPredicateRegForFillSpill() const { // Returns the recorded register; the field starts at 0, which the frame-lowering code in this patch treats as "none selected".
+ return PredicateRegForFillSpill;
+ }
+
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index ea7808d73093e..3a94b0333e267 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -103,20 +89,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -129,21 +109,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -210,45 +184,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -258,20 +218,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -284,21 +238,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -366,45 +314,31 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -414,20 +348,14 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -440,21 +368,15 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -521,45 +443,31 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -569,20 +477,14 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -595,21 +497,15 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -677,45 +573,31 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -725,20 +607,14 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -751,21 +627,15 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -832,45 +702,31 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -880,20 +736,14 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -906,21 +756,15 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -988,45 +832,31 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -1036,20 +866,14 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1062,21 +886,15 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1143,45 +961,31 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -1191,20 +995,14 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1217,21 +1015,15 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1301,46 +1093,32 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1351,19 +1129,14 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1380,20 +1153,15 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1466,46 +1234,32 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1516,19 +1270,14 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1545,20 +1294,15 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1632,46 +1376,32 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1682,19 +1412,14 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1711,20 +1436,15 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1798,46 +1518,32 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1848,19 +1554,14 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1877,20 +1578,15 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1964,46 +1660,32 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2014,19 +1696,14 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2043,20 +1720,15 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2130,46 +1802,32 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2180,19 +1838,14 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2209,20 +1862,15 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2296,46 +1944,32 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2346,19 +1980,14 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2375,20 +2004,15 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2462,46 +2086,32 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2512,19 +2122,14 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2541,20 +2146,15 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 7e2d28fbf7982..8ecb7c858c6a2 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -8,45 +8,31 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -56,20 +42,14 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -82,21 +62,15 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -116,45 +90,31 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -164,20 +124,14 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -190,21 +144,15 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -225,45 +173,31 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -273,20 +207,14 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -299,21 +227,15 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -333,45 +255,31 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -381,20 +289,14 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -407,21 +309,15 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -442,45 +338,31 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -490,20 +372,14 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -516,21 +392,15 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -550,45 +420,31 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -598,20 +454,14 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -624,21 +474,15 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -659,45 +503,31 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -707,20 +537,14 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -733,21 +557,15 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -767,45 +585,31 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -815,20 +619,14 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -841,21 +639,15 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -876,46 +668,32 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -926,19 +704,14 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -955,20 +728,15 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -992,46 +760,32 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1042,19 +796,14 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1071,20 +820,15 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1109,46 +853,32 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1159,19 +889,14 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1188,20 +913,15 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1225,46 +945,32 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1275,19 +981,14 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1304,20 +1005,15 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1342,46 +1038,32 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1392,19 +1074,14 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1421,20 +1098,15 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1458,46 +1130,32 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1508,19 +1166,14 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1537,20 +1190,15 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1575,46 +1223,32 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1625,19 +1259,14 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1654,20 +1283,15 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1691,46 +1315,32 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1741,19 +1351,14 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1770,20 +1375,15 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
new file mode 100644
index 0000000000000..c62016d8ea01a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -0,0 +1,556 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR
+
+
+declare void @my_func()
+
+define void @fbyte(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: fbyte:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-18
+; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; NOPAIR-NEXT: .cfi_offset w30, -8
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; NOPAIR-NEXT: bl my_func
+; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #18
+; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: fbyte:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-18
+; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; PAIR-NEXT: .cfi_offset w30, -8
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: bl my_func
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #18
+; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void @my_func()
+ ret void
+}
+
+define void @fhalf(<vscale x 8 x half> %v) {
+; NOPAIR-LABEL: fhalf:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-18
+; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; NOPAIR-NEXT: .cfi_offset w30, -8
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; NOPAIR-NEXT: bl my_func
+; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #18
+; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: fhalf:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-18
+; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; PAIR-NEXT: .cfi_offset w30, -8
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: bl my_func
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #18
+; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void @my_func()
+ ret void
+}
+
+;; Do NOT group Z10: it is spilled/filled on its own with STR/LDR.
+;; DO group Z8 and Z9, and save P8 so that it can be used as the predicate (PN8).
+define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() {
+; NOPAIR-LABEL: test_clobbers_z_p_regs:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-4
+; NOPAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #4
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_p_regs:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-4
+; PAIR-NEXT: str p8, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #4
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{p4},~{p5},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
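+;; Do NOT group Z10: it is spilled/filled on its own with STR/LDR.
+;; DO group Z8 and Z9, and use P9 (already clobbered, hence saved) as the predicate (PN9).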
+define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() {
+; NOPAIR-LABEL: test_clobbers_z_p_regs2:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-4
+; NOPAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #4
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_p_regs2:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-4
+; PAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn9.b
+; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ptrue pn9.b
+; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #4
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{p9},~{p10},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+
+;; Test the order of PReg and ZReg spills/fills when no PReg is clobbered (P8 is still saved for use as PN8)
+define aarch64_sve_vector_pcs void @test_clobbers_z_regs() {
+; NOPAIR-LABEL: test_clobbers_z_regs:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-2
+; NOPAIR-NEXT: str z9, [sp] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #2
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_regs:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-3
+; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #3
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{z8},~{z9}"()
+ ret void
+}
+
+;; Do NOT group Z8 and Z9, and
+;; do NOT save P8:
+;; this function does not use one of the calling conventions that allow pairing.
+;; NOPAIR and PAIR should produce the same assembly.
+define void @test_clobbers_z_regs_negative() {
+; NOPAIR-LABEL: test_clobbers_z_regs_negative:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_def_cfa_offset 16
+; NOPAIR-NEXT: .cfi_offset b8, -8
+; NOPAIR-NEXT: .cfi_offset b9, -16
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_regs_negative:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: .cfi_def_cfa_offset 16
+; PAIR-NEXT: .cfi_offset b8, -8
+; PAIR-NEXT: .cfi_offset b9, -16
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{z8},~{z9}"()
+ ret void
+}
+
+;; Do NOT save P8 and do NOT group Z8 and Z10: they are not a consecutive register pair
+define aarch64_sve_vector_pcs void @test_clobbers_2_z_regs_negative() {
+; NOPAIR-LABEL: test_clobbers_2_z_regs_negative:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-2
+; NOPAIR-NEXT: str z10, [sp] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #2
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_2_z_regs_negative:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-2
+; PAIR-NEXT: str z10, [sp] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #2
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{z8},~{z10}"()
+ ret void
+}
+
+
+;; Nothing to do here:
+;; there are no ZReg pairs to save.
+define aarch64_sve_vector_pcs void @test_clobbers_p_reg_negative() {
+; NOPAIR-LABEL: test_clobbers_p_reg_negative:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-1
+; NOPAIR-NEXT: str p10, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr p10, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #1
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_p_reg_negative:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-1
+; PAIR-NEXT: str p10, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ldr p10, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #1
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{p10}"()
+ ret void
+}
More information about the llvm-commits mailing list