[llvm] [LLVM][AArch64] Use load/store with consecutive registers in SME2 or S… (PR #77665)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 9 07:00:22 PDT 2024
https://github.com/CarolineConcatto updated https://github.com/llvm/llvm-project/pull/77665
From 5e9b05b7a48ca5e4eb8c28db42417379d1b90993 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Wed, 10 Jan 2024 17:23:04 +0000
Subject: [PATCH 01/11] [LLVM][AArch64] Use load/store with consecutive
registers in SME2 or SVE2.1 for spill/fill
When possible, the spill/fill of Z registers in frame lowering now uses the
consecutive-register st1b/ld1b pair instructions available in SME2 or SVE2.1,
instead of one str/ldr per register.
---
.../Target/AArch64/AArch64FrameLowering.cpp | 92 +-
.../AArch64/AArch64MachineFunctionInfo.h | 2 +
.../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 1488 ++++++-----------
.../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 1488 ++++++-----------
.../AArch64/sve-callee-save-restore-pairs.ll | 295 ++++
5 files changed, 1459 insertions(+), 1906 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0e9adde564b3e..cb7ba61a1236e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1509,6 +1509,11 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
default:
return false;
+ case AArch64::PTRUE_C_B:
+ case AArch64::LD1B_2Z_IMM:
+ case AArch64::ST1B_2Z_IMM:
+ return I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
+ I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2();
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
@@ -2782,6 +2787,16 @@ struct RegPairInfo {
} // end anonymous namespace
+static unsigned findFreePredicateAsCounterReg(MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MCRegister PReg :
+ {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
+ AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+ if (!MRI.isReserved(PReg))
+ return PReg;
+ }
+ llvm_unreachable("cannot find a free predicate");
+}
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2792,6 +2807,7 @@ static void computeCalleeSaveRegisterPairs(
bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
+ const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -2860,7 +2876,11 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg2 = NextReg;
break;
case RegPairInfo::PPR:
+ break;
case RegPairInfo::ZPR:
+ if (Subtarget.hasSVE2p1() || Subtarget.hasSME2())
+ if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+ RPI.Reg2 = NextReg;
break;
}
}
@@ -2905,7 +2925,7 @@ static void computeCalleeSaveRegisterPairs(
assert(OffsetPre % Scale == 0);
if (RPI.isScalable())
- ScalableByteOffset += StackFillDir * Scale;
+ ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
else
ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
@@ -2916,9 +2936,6 @@ static void computeCalleeSaveRegisterPairs(
(IsWindows && RPI.Reg2 == AArch64::LR)))
ByteOffset += StackFillDir * 8;
- assert(!(RPI.isScalable() && RPI.isPaired()) &&
- "Paired spill/fill instructions don't exist for SVE vectors");
-
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -3005,6 +3022,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
return true;
}
+ bool PtrueCreated = false;
for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3039,10 +3057,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = AArch64::STR_ZXI;
- Size = 16;
- Alignment = Align(16);
- break;
+ StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ Size = 16;
+ Alignment = Align(16);
+ break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
@@ -3066,19 +3084,37 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
+
+ unsigned PnReg;
+ unsigned PairRegs;
+ if (RPI.isPaired() && RPI.isScalable()) {
+ PnReg = findFreePredicateAsCounterReg(MF);
+ PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ if (!PtrueCreated) {
+ PtrueCreated = true;
+ BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ if (RPI.isScalable())
+ MIB.addReg(PairRegs);
+ else
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
}
- MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
- .addReg(AArch64::SP)
+ if (RPI.isPaired() && RPI.isScalable())
+ MIB.addReg(PnReg);
+ else
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+ MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale],
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
@@ -3090,8 +3126,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
- MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
+ if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+ MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+ if (RPI.isPaired())
+ MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+ }
}
return true;
@@ -3111,7 +3150,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+ auto EmitMI = [&, PtrueCreated = false](
+ const RegPairInfo &RPI) mutable -> MachineBasicBlock::iterator {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3143,7 +3183,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = AArch64::LDR_ZXI;
+ LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3168,15 +3208,31 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
+
+ unsigned PnReg;
+ unsigned PairRegs;
+ if (RPI.isPaired() && RPI.isScalable()) {
+ PnReg = findFreePredicateAsCounterReg(MF);
+ PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ if (!PtrueCreated) {
+ PtrueCreated = true;
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+ }
+
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
- MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addReg(RPI.isScalable() ? PairRegs : Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
}
- MIB.addReg(Reg1, getDefRegState(true))
- .addReg(AArch64::SP)
+ if (RPI.isPaired() && RPI.isScalable())
+ MIB.addReg(PnReg);
+ else
+ MIB.addReg(Reg1, getDefRegState(true));
+ MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale]
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111..4502c3c3c1e03 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -307,6 +307,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
int FrameIdx = Info.getFrameIdx();
if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
continue;
+ if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ continue;
int64_t Offset = MFI.getObjectOffset(FrameIdx);
int64_t ObjSize = MFI.getObjectSize(FrameIdx);
MinOffset = std::min<int64_t>(Offset, MinOffset);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index ea7808d73093e..3a94b0333e267 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -103,20 +89,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -129,21 +109,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -210,45 +184,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -258,20 +218,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -284,21 +238,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -366,45 +314,31 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -414,20 +348,14 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -440,21 +368,15 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -521,45 +443,31 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -569,20 +477,14 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -595,21 +497,15 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -677,45 +573,31 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -725,20 +607,14 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -751,21 +627,15 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -832,45 +702,31 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -880,20 +736,14 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -906,21 +756,15 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -988,45 +832,31 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -1036,20 +866,14 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1062,21 +886,15 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1143,45 +961,31 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -1191,20 +995,14 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1217,21 +1015,15 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1301,46 +1093,32 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1351,19 +1129,14 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1380,20 +1153,15 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1466,46 +1234,32 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1516,19 +1270,14 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1545,20 +1294,15 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1632,46 +1376,32 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1682,19 +1412,14 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1711,20 +1436,15 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1798,46 +1518,32 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1848,19 +1554,14 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1877,20 +1578,15 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1964,46 +1660,32 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2014,19 +1696,14 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2043,20 +1720,15 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2130,46 +1802,32 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2180,19 +1838,14 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2209,20 +1862,15 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2296,46 +1944,32 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2346,19 +1980,14 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2375,20 +2004,15 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2462,46 +2086,32 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2512,19 +2122,14 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2541,20 +2146,15 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 7e2d28fbf7982..8ecb7c858c6a2 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -8,45 +8,31 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -56,20 +42,14 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -82,21 +62,15 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -116,45 +90,31 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -164,20 +124,14 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -190,21 +144,15 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -225,45 +173,31 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -273,20 +207,14 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -299,21 +227,15 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -333,45 +255,31 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -381,20 +289,14 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -407,21 +309,15 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -442,45 +338,31 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -490,20 +372,14 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -516,21 +392,15 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -550,45 +420,31 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -598,20 +454,14 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -624,21 +474,15 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -659,45 +503,31 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -707,20 +537,14 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -733,21 +557,15 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -767,45 +585,31 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -815,20 +619,14 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -841,21 +639,15 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -876,46 +668,32 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -926,19 +704,14 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -955,20 +728,15 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -992,46 +760,32 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1042,19 +796,14 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1071,20 +820,15 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1109,46 +853,32 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1159,19 +889,14 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1188,20 +913,15 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1225,46 +945,32 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1275,19 +981,14 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1304,20 +1005,15 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1342,46 +1038,32 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1392,19 +1074,14 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1421,20 +1098,15 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1458,46 +1130,32 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1508,19 +1166,14 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1537,20 +1190,15 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1575,46 +1223,32 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1625,19 +1259,14 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1654,20 +1283,15 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1691,46 +1315,32 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1741,19 +1351,14 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1770,20 +1375,15 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
new file mode 100644
index 0000000000000..e53ed1777979c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR
+
+
+declare void @my_func()
+
+define void @fbyte(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: fbyte:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-18
+; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; NOPAIR-NEXT: .cfi_offset w30, -8
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; NOPAIR-NEXT: bl my_func
+; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #18
+; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: fbyte:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-18
+; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; PAIR-NEXT: .cfi_offset w30, -8
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: bl my_func
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #18
+; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void @my_func()
+ ret void
+}
+
+define void @fhalf(<vscale x 8 x half> %v) {
+; NOPAIR-LABEL: fhalf:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-18
+; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; NOPAIR-NEXT: .cfi_offset w30, -8
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; NOPAIR-NEXT: bl my_func
+; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #18
+; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: fhalf:
+; PAIR: // %bb.0:
+; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-18
+; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; PAIR-NEXT: .cfi_offset w30, -8
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; PAIR-NEXT: bl my_func
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #18
+; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; PAIR-NEXT: ret
+ call void @my_func()
+ ret void
+}
+
+
+
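Note on the checks above: with the paired form the callee-saved Z registers are spilled in even/odd pairs (z8/z9 through z22/z23) by a single predicated st1b per pair, so each pair occupies one 32-byte folded slot and the scalable offset advances two vector lengths at a time. The snippet below is a minimal standalone sketch of the pairing rule that computeCalleeSaveRegisterPairs applies (first register even, second the immediate successor); the canPairZRegs name and the plain-integer register numbering are illustrative stand-ins, not LLVM API.

#include <cassert>

// Illustration only: Z registers are modelled as plain integers 0..31
// (z0..z31) rather than LLVM's AArch64::Z0-based enum values.
static bool canPairZRegs(int First, int Next) {
  // Mirrors the pairing condition in computeCalleeSaveRegisterPairs:
  // the first register must be even and the candidate must be the
  // immediately following register.
  return (First & 1) == 0 && Next == First + 1;
}

int main() {
  assert(canPairZRegs(8, 9));    // z8/z9 -> one st1b/ld1b pair
  assert(!canPairZRegs(9, 10));  // odd first register: kept unpaired
  assert(!canPairZRegs(8, 10));  // not consecutive: kept unpaired
  return 0;
}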
>From fff3e347753f84dac18e68624cc0026adb7887d2 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Wed, 10 Jan 2024 17:23:04 +0000
Subject: [PATCH 02/11] [LLVM][AArch64]Use load/store with consecutive
registers in SME2 or SVE2.1 for spill/fill
When possible the spill/fill register in Frame Lowering uses the ld/st
consecutive pairs available in sme or sve2.1.
---
.../Target/AArch64/AArch64FrameLowering.cpp | 29 +++++++++++--------
.../AArch64/sve-callee-save-restore-pairs.ll | 3 --
2 files changed, 17 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cb7ba61a1236e..c45630f4d9bd1 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2762,7 +2762,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+ enum RegType { GPR, FPR64, FPR128, ZPR, PPR } Type;
RegPairInfo() = default;
@@ -2787,16 +2787,22 @@ struct RegPairInfo {
} // end anonymous namespace
-unsigned findFreePredicateAsCounterReg(MachineFunction &MF) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
+static unsigned findFreePredicateAsCounterReg(MachineBasicBlock *MBB) {
+ MachineFunction *MF = MBB->getParent();
+
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
+ getLiveRegsForEntryMBB(LiveRegs, *MBB);
+
for (MCRegister PReg :
{AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
- AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
- if (!MRI.isReserved(PReg))
+ AArch64::PN13, AArch64::PN14, AArch64::PN15}){
return PReg;
}
- llvm_unreachable("cannot find a free predicate");
+ llvm_unreachable("No predicated register free");
}
+
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -3085,17 +3091,18 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- unsigned PnReg;
unsigned PairRegs;
+ unsigned PnReg;
if (RPI.isPaired() && RPI.isScalable()) {
PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
if (!PtrueCreated) {
PtrueCreated = true;
- PnReg = findFreePredicateAsCounterReg(MF);
+ PnReg = findFreePredicateAsCounterReg(&MBB);
BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
- .setMIFlags(MachineInstr::FrameDestroy);
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
+
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
MBB.addLiveIn(Reg1);
@@ -3149,8 +3156,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
DL = MBBI->getDebugLoc();
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
-
- bool PtrueCreated = false;
auto EmitMI = [&, PtrueCreated = false](const RegPairInfo &RPI) mutable -> MachineBasicBlock::iterator {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3215,7 +3220,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
if (!PtrueCreated) {
PtrueCreated = true;
- PnReg = findFreePredicateAsCounterReg(MF);
+ PnReg = findFreePredicateAsCounterReg(&MBB);
BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameDestroy);
}
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
index e53ed1777979c..88a09ee2862df 100644
--- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -290,6 +290,3 @@ define void @fhalf(<vscale x 8 x half> %v) {
call void @my_func()
ret void
}
-
-
-
>From f61f7bcb8d171b05c6b7c830c90c5c24fbbdaebf Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Mon, 19 Feb 2024 17:10:59 +0000
Subject: [PATCH 03/11] Address review comments
---
.../Target/AArch64/AArch64FrameLowering.cpp | 26 +++++--------------
1 file changed, 7 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c45630f4d9bd1..56d8603b4176b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2762,7 +2762,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, ZPR, PPR } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
RegPairInfo() = default;
@@ -2787,22 +2787,6 @@ struct RegPairInfo {
} // end anonymous namespace
-static unsigned findFreePredicateAsCounterReg(MachineBasicBlock *MBB) {
- MachineFunction *MF = MBB->getParent();
-
- const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
- LivePhysRegs LiveRegs(TRI);
- getLiveRegsForEntryMBB(LiveRegs, *MBB);
-
- for (MCRegister PReg :
- {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
- AArch64::PN13, AArch64::PN14, AArch64::PN15}){
- return PReg;
- }
- llvm_unreachable("No predicated register free");
-}
-
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -3097,7 +3081,9 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
if (!PtrueCreated) {
PtrueCreated = true;
- PnReg = findFreePredicateAsCounterReg(&MBB);
+ // Any one of predicate-as-count will be free to use
+ // This can be replaced in the future if needed
+ PnReg = AArch64::PN8;
BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameSetup);
}
@@ -3220,7 +3206,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
if (!PtrueCreated) {
PtrueCreated = true;
- PnReg = findFreePredicateAsCounterReg(&MBB);
+ // Any one of predicate-as-count will be free to use
+ // This can be replaced in the future if needed
+ PnReg = AArch64::PN8;
BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameDestroy);
}
>From 94f21b1f6fa00e94729e9f814b97872c28d859a8 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Mon, 19 Feb 2024 17:20:32 +0000
Subject: [PATCH 04/11] Remove unneeded check from
 AArch64MachineFunctionInfo.h
---
llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 4502c3c3c1e03..d5941e6284111 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -307,8 +307,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
int FrameIdx = Info.getFrameIdx();
if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
continue;
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
- continue;
int64_t Offset = MFI.getObjectOffset(FrameIdx);
int64_t ObjSize = MFI.getObjectSize(FrameIdx);
MinOffset = std::min<int64_t>(Offset, MinOffset);
>From 19a8ab66ebaacaf9386b1ae836addfc90d7c5ca0 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Tue, 20 Feb 2024 08:45:39 +0000
Subject: [PATCH 05/11] Use assert in IsSVECalleeSave for the ld/st/ptrue
---
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 56d8603b4176b..4cc5a78089ef4 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1512,8 +1512,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
case AArch64::PTRUE_C_B:
case AArch64::LD1B_2Z_IMM:
case AArch64::ST1B_2Z_IMM:
- return I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
- I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2();
+ assert((I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
+ I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2()) &&
+ "Expected SME2 or SVE2.1 Targer Architecture.");
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
>From b18f3a629795c1266894368ae1d273359ea2d2a6 Mon Sep 17 00:00:00 2001
From: CarolineConcatto <caroline.concatto at arm.com>
Date: Tue, 20 Feb 2024 17:42:09 +0000
Subject: [PATCH 06/11] Update AArch64FrameLowering.cpp
Add missing ;
---
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f5d83f9669100..8768728725500 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3157,7 +3157,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
auto ZPREnd = std::find_if(RegPairs.rbegin(), RegPairs.rend(), IsZPR);
std::reverse(ZPRBegin, ZPREnd.base());
- bool PtrueCreated = false
+ bool PtrueCreated = false;
for (const RegPairInfo &RPI : RegPairs) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
>From ecb0f5700686e1c37e5c86df090ac92a228a8d40 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Wed, 13 Mar 2024 11:38:07 +0000
Subject: [PATCH 07/11] Save a predicate register used for save and restore
---
.../Target/AArch64/AArch64FrameLowering.cpp | 157 ++++++++++++------
.../AArch64/AArch64MachineFunctionInfo.h | 9 +
.../AArch64/sve-callee-save-restore-pairs.ll | 156 +++++++++++++++++
3 files changed, 273 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f15c93201457c..425a423bcba99 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1511,9 +1511,6 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
case AArch64::PTRUE_C_B:
case AArch64::LD1B_2Z_IMM:
case AArch64::ST1B_2Z_IMM:
- assert((I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
- I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2()) &&
- "Expected SME2 or SVE2.1 Targer Architecture.");
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
@@ -2787,6 +2784,28 @@ struct RegPairInfo {
} // end anonymous namespace
+static unsigned getPredicateAsCounterReg(unsigned Reg) {
+ switch (Reg) {
+ case AArch64::P8:
+ return AArch64::PN8;
+ case AArch64::P9:
+ return AArch64::PN9;
+ case AArch64::P10:
+ return AArch64::PN10;
+ case AArch64::P11:
+ return AArch64::PN11;
+ case AArch64::P12:
+ return AArch64::PN12;
+ case AArch64::P13:
+ return AArch64::PN13;
+ case AArch64::P14:
+ return AArch64::PN14;
+ case AArch64::P15:
+ return AArch64::PN15;
+ }
+ return 0;
+}
+
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -3075,48 +3094,57 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- unsigned PairRegs;
- unsigned PnReg;
if (RPI.isPaired() && RPI.isScalable()) {
- PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ unsigned PnReg = AFI->getPredicateRegForFillSpill();
if (!PtrueCreated) {
PtrueCreated = true;
- // Any one of predicate-as-count will be free to use
- // This can be replaced in the future if needed
- PnReg = AArch64::PN8;
BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameSetup);
}
- }
-
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- if (!MRI.isReserved(Reg1))
- MBB.addLiveIn(Reg1);
- if (RPI.isPaired()) {
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
- if (RPI.isScalable())
- MIB.addReg(PairRegs);
- else
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ unsigned PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ MIB.addReg(PairRegs);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
- }
- if (RPI.isPaired() && RPI.isScalable())
MIB.addReg(PnReg);
- else
- MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameSetup);
- MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
- MachineMemOperand::MOStore, Size, Alignment));
- if (NeedsWinCFI)
- InsertSEH(MIB, TII, MachineInstr::FrameSetup);
-
+ MIB.addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale],
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOStore, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+ } else { // The code when the pair of ZReg is not present
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
+ if (RPI.isPaired()) {
+ if (!MRI.isReserved(Reg2))
+ MBB.addLiveIn(Reg2);
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
+ MachineMemOperand::MOStore, Size, Alignment));
+ }
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
+ .addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale],
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOStore, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+ }
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
@@ -3124,7 +3152,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (RPI.isPaired())
MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
}
-
}
return true;
}
@@ -3222,30 +3249,38 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(FrameIdxReg1, FrameIdxReg2);
}
- unsigned PnReg;
- unsigned PairRegs;
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (RPI.isPaired() && RPI.isScalable()) {
- PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ unsigned PnReg = AFI->getPredicateRegForFillSpill();
if (!PtrueCreated) {
PtrueCreated = true;
- // Any one of predicate-as-count will be free to use
- // This can be replaced in the future if needed
- PnReg = AArch64::PN8;
BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameDestroy);
}
- }
-
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
- if (RPI.isPaired()) {
- MIB.addReg(RPI.isScalable() ? PairRegs : Reg2, getDefRegState(true));
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
+ unsigned PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ MIB.addReg(PairRegs, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
- }
- if (RPI.isPaired() && RPI.isScalable())
MIB.addReg(PnReg);
- else
+ MIB.addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale]
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOLoad, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
+ if (RPI.isPaired()) {
+ MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
+ MachineMemOperand::MOLoad, Size, Alignment));
+ }
MIB.addReg(Reg1, getDefRegState(true));
MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale]
@@ -3256,8 +3291,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+ }
}
-
return true;
}
@@ -3286,6 +3321,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned ExtraCSSpill = 0;
bool HasUnpairedGPR64 = false;
+ bool HasPairZReg = false;
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
const unsigned Reg = CSRegs[i];
@@ -3339,6 +3375,29 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
!RegInfo->isReservedReg(MF, PairedReg))
ExtraCSSpill = PairedReg;
}
+
+ // Save PReg in FunctionInfo to build PTRUE instruction later. The PTRUE is
+ // being used in the function to save and restore the pair of ZReg
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
+ if (AArch64::PPRRegClass.contains(Reg) &&
+ (Reg > AArch64::P8 || Reg < AArch64::P15) && SavedRegs.test(Reg) &&
+ AFI->getPredicateRegForFillSpill() == 0)
+ AFI->setPredicateRegForFillSpill(getPredicateAsCounterReg(Reg));
+
+ // Check if there is a pair of ZRegs, so it can select P8 to create PTRUE,
+ // in case there is no PRege being saved(above)
+ HasPairZReg =
+ HasPairZReg || (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
+ SavedRegs.test(CSRegs[i ^ 1]));
+ }
+ }
+
+ // Make sure there is a PReg saved to be used in save and restore when there
+ // is ZReg pair.
+ if (AFI->getPredicateRegForFillSpill() == 0 && HasPairZReg) {
+ SavedRegs.set(AArch64::P8);
+ AFI->setPredicateRegForFillSpill(AArch64::PN8);
}
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111..2ab8b9f39d3a2 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -212,6 +212,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// on function entry to record the initial pstate of a function.
Register PStateSMReg = MCRegister::NoRegister;
+ // Has the PNReg used to build PTRUE instruction.
+ // The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
+ unsigned PredicateRegForFillSpill = 0;
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -220,6 +224,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
+ void setPredicateRegForFillSpill(unsigned Reg) {
+ PredicateRegForFillSpill = Reg;
+ }
+ unsigned getPredicateRegForFillSpill() { return PredicateRegForFillSpill; }
+
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
index 88a09ee2862df..04cb333f85b66 100644
--- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -290,3 +290,159 @@ define void @fhalf(<vscale x 8 x half> %v) {
call void @my_func()
ret void
}
+
+;; Do NOT save P8 and do NOT group the Z8 and Z10 registers
+define void @test_clobbers_2_z_regs_(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: test_clobbers_2_z_regs_:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-2
+; NOPAIR-NEXT: str z10, [sp] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #2
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_2_z_regs_:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-2
+; PAIR-NEXT: str z10, [sp] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #2
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{z8},~{z10}"()
+ ret void
+}
+
+;; Do NOT group Z10
+;; DO group Z8 and Z9 and save P8
+define void @test_clobbers_z_p_regs(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: test_clobbers_z_p_regs:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-4
+; NOPAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #4
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_p_regs:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-4
+; PAIR-NEXT: str p8, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #4
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{p4},~{p5},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+;; Do NOT group Z10
+;; DO group Z8 and Z9 and use P9
+define void @test_clobbers_z_p_regs2(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: test_clobbers_z_p_regs2:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-4
+; NOPAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #4
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_p_regs2:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-4
+; PAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: ptrue pn9.b
+; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ptrue pn9.b
+; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #4
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{p9},~{p10},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
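A short aside on the P-to-PN mapping introduced above: getPredicateAsCounterReg maps a callee-saved predicate (p8..p15) to its predicate-as-counter alias (pn8..pn15), and the later revision replaces the switch with plain arithmetic, relying on both register blocks being laid out contiguously in the target enum. The sketch below checks that equivalence with made-up enum values; the real AArch64::P0/PN0 values differ, and only the contiguity assumption matters here.

#include <cassert>

// Placeholder numbering for illustration only; the real AArch64 register
// enum values are different, but P0..P15 and PN0..PN15 are each contiguous,
// which is all the arithmetic relies on.
enum FakeReg : unsigned { P0 = 100, P8 = 108, P15 = 115,
                          PN0 = 200, PN8 = 208, PN15 = 215 };

// Arithmetic form used after review: (Reg - P0) + PN0.
static unsigned predicateAsCounter(unsigned PReg) {
  assert(PReg >= P8 && PReg <= P15 && "expected a spillable predicate");
  return (PReg - P0) + PN0;
}

int main() {
  assert(predicateAsCounter(P8) == PN8);
  assert(predicateAsCounter(P15) == PN15);
  return 0;
}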
>From c8bdbb963b73b9541d5c5514a50653dbc711583f Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Thu, 25 Apr 2024 16:58:10 +0000
Subject: [PATCH 08/11] Address comments about PN8 register use
---
.../Target/AArch64/AArch64FrameLowering.cpp | 52 +++++++------------
.../AArch64/AArch64MachineFunctionInfo.h | 4 +-
.../AArch64/sve-callee-save-restore-pairs.ll | 46 +++++++++++++++-
3 files changed, 66 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 425a423bcba99..9b49ea9bdfca3 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2784,28 +2784,6 @@ struct RegPairInfo {
} // end anonymous namespace
-static unsigned getPredicateAsCounterReg(unsigned Reg) {
- switch (Reg) {
- case AArch64::P8:
- return AArch64::PN8;
- case AArch64::P9:
- return AArch64::PN9;
- case AArch64::P10:
- return AArch64::PN10;
- case AArch64::P11:
- return AArch64::PN11;
- case AArch64::P12:
- return AArch64::PN12;
- case AArch64::P13:
- return AArch64::PN13;
- case AArch64::P14:
- return AArch64::PN14;
- case AArch64::P15:
- return AArch64::PN15;
- }
- return 0;
-}
-
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2816,7 +2794,6 @@ static void computeCalleeSaveRegisterPairs(
bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
- const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -2887,7 +2864,7 @@ static void computeCalleeSaveRegisterPairs(
case RegPairInfo::PPR:
break;
case RegPairInfo::ZPR:
- if (Subtarget.hasSVE2p1() || Subtarget.hasSME2())
+ if (AFI->getPredicateRegForFillSpill() != 0)
if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
RPI.Reg2 = NextReg;
break;
@@ -3107,8 +3084,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MBB.addLiveIn(Reg1);
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
- unsigned PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
- MIB.addReg(PairRegs);
+ MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
@@ -3258,8 +3234,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
.setMIFlags(MachineInstr::FrameDestroy);
}
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
- unsigned PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
- MIB.addReg(PairRegs, getDefRegState(true));
+ MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
+ getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
@@ -3381,9 +3357,9 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
- (Reg > AArch64::P8 || Reg < AArch64::P15) && SavedRegs.test(Reg) &&
+ (Reg >= AArch64::P8 && Reg <= AArch64::P15) && SavedRegs.test(Reg) &&
AFI->getPredicateRegForFillSpill() == 0)
- AFI->setPredicateRegForFillSpill(getPredicateAsCounterReg(Reg));
+ AFI->setPredicateRegForFillSpill((Reg - AArch64::P0) + AArch64::PN0);
// Check if there is a pair of ZRegs, so it can select P8 to create PTRUE,
// in case there is no PRege being saved(above)
@@ -3395,10 +3371,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Make sure there is a PReg saved to be used in save and restore when there
// is ZReg pair.
- if (AFI->getPredicateRegForFillSpill() == 0 && HasPairZReg) {
- SavedRegs.set(AArch64::P8);
- AFI->setPredicateRegForFillSpill(AArch64::PN8);
- }
+ if ((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
+ (MF.getFunction().getCallingConv() ==
+ CallingConv::AArch64_SVE_VectorCall ||
+ MF.getFunction().getCallingConv() ==
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 ||
+ MF.getFunction().getCallingConv() ==
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2))
+ if (AFI->getPredicateRegForFillSpill() == 0 && HasPairZReg) {
+ assert(!RegInfo->isReservedReg(MF, AArch64::P8) && "P8 is reserved");
+ SavedRegs.set(AArch64::P8);
+ AFI->setPredicateRegForFillSpill(AArch64::PN8);
+ }
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
!Subtarget.isTargetWindows()) {
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 2ab8b9f39d3a2..df09fc5592edf 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -227,7 +227,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
void setPredicateRegForFillSpill(unsigned Reg) {
PredicateRegForFillSpill = Reg;
}
- unsigned getPredicateRegForFillSpill() { return PredicateRegForFillSpill; }
+ unsigned getPredicateRegForFillSpill() const {
+ return PredicateRegForFillSpill;
+ }
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
index 04cb333f85b66..a93c168624457 100644
--- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -334,7 +334,7 @@ define void @test_clobbers_2_z_regs_(<vscale x 16 x i8> %v) {
;; Do NOT group Z10
;; DO group Z8 and Z9 and save P8
-define void @test_clobbers_z_p_regs(<vscale x 16 x i8> %v) {
+define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs(<vscale x 16 x i8> %v) {
; NOPAIR-LABEL: test_clobbers_z_p_regs:
; NOPAIR: // %bb.0:
; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
@@ -446,3 +446,47 @@ define void @test_clobbers_z_p_regs2(<vscale x 16 x i8> %v) {
ret void
}
+
+;; DO NOT group Z8 and Z9 and
+;; DO NOT save P8
+;; The calling convention is not one of the allowed ones
+;; NOPAIR and PAIR should have the same assembly
+define void @test_clobbers_z_p_regs_negative(<vscale x 16 x i8> %v) {
+; NOPAIR-LABEL: test_clobbers_z_p_regs_negative:
+; NOPAIR: // %bb.0:
+; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; NOPAIR-NEXT: addvl sp, sp, #-2
+; NOPAIR-NEXT: str z9, [sp] // 16-byte Folded Spill
+; NOPAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; NOPAIR-NEXT: .cfi_offset w29, -16
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; NOPAIR-NEXT: //APP
+; NOPAIR-NEXT: //NO_APP
+; NOPAIR-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; NOPAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; NOPAIR-NEXT: addvl sp, sp, #2
+; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; NOPAIR-NEXT: ret
+;
+; PAIR-LABEL: test_clobbers_z_p_regs_negative:
+; PAIR: // %bb.0:
+; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; PAIR-NEXT: addvl sp, sp, #-2
+; PAIR-NEXT: str z9, [sp] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; PAIR-NEXT: .cfi_offset w29, -16
+; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; PAIR-NEXT: //APP
+; PAIR-NEXT: //NO_APP
+; PAIR-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: addvl sp, sp, #2
+; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; PAIR-NEXT: ret
+ call void asm sideeffect "", "~{z8},~{z9}"()
+ ret void
+}
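One of the fixes folded into this patch is worth spelling out: the earlier bound check (Reg > AArch64::P8 || Reg < AArch64::P15) accepts every register, since any value is either above P8 or below P15, whereas the corrected (Reg >= AArch64::P8 && Reg <= AArch64::P15) restricts the choice to p8..p15. A tiny standalone check, using placeholder values in place of the register enum:

#include <cassert>

int main() {
  // Placeholder values standing in for the predicate registers p0..p15;
  // only the shape of the two conditions matters here.
  const unsigned P8 = 8, P15 = 15;
  for (unsigned Reg = 0; Reg < 16; ++Reg) {
    bool Old = (Reg > P8 || Reg < P15);    // always true: not a range check
    bool New = (Reg >= P8 && Reg <= P15);  // true only for p8..p15
    assert(Old);
    assert(New == (Reg >= 8 && Reg <= 15));
  }
  return 0;
}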
>From 633fa858a4c1083450d9d0a50869ee12a702a538 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Tue, 7 May 2024 14:38:20 +0000
Subject: [PATCH 09/11] Add asserts when using PnReg for ld/st with a pair of
ZRegs
---
.../Target/AArch64/AArch64FrameLowering.cpp | 66 +++++++++----------
1 file changed, 32 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9b49ea9bdfca3..7b1622ce224bd 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2904,7 +2904,6 @@ static void computeCalleeSaveRegisterPairs(
if (NeedsWinCFI &&
RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
-
int Scale = RPI.getScale();
int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
@@ -3008,7 +3007,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
return true;
}
- bool PtrueCreated = false;
+ bool PTrueCreated = false;
for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3072,10 +3071,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
if (RPI.isPaired() && RPI.isScalable()) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned PnReg = AFI->getPredicateRegForFillSpill();
- if (!PtrueCreated) {
- PtrueCreated = true;
+ assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
+ "Expects SVE2.1 or SME2 target and a predicate register");
+ if (!PTrueCreated) {
+ PTrueCreated = true;
BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameSetup);
}
@@ -3145,7 +3147,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
DL = MBBI->getDebugLoc();
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
-
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -3166,7 +3167,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
std::reverse(ZPRBegin, ZPREnd);
- bool PtrueCreated = false;
+ bool PTrueCreated = false;
for (const RegPairInfo &RPI : RegPairs) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3227,9 +3228,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (RPI.isPaired() && RPI.isScalable()) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
unsigned PnReg = AFI->getPredicateRegForFillSpill();
- if (!PtrueCreated) {
- PtrueCreated = true;
+ assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
+ "Expects SVE2.1 or SME2 target and a predicate register");
+ if (!PTrueCreated) {
+ PTrueCreated = true;
BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
.setMIFlags(MachineInstr::FrameDestroy);
}
@@ -3351,39 +3355,33 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
!RegInfo->isReservedReg(MF, PairedReg))
ExtraCSSpill = PairedReg;
}
+ // Check if there is a pair of ZRegs, so it can select PReg for spill/fill
+ HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
+ SavedRegs.test(CSRegs[i ^ 1]));
+ }
- // Save PReg in FunctionInfo to build PTRUE instruction later. The PTRUE is
- // being used in the function to save and restore the pair of ZReg
+ if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
- if (AArch64::PPRRegClass.contains(Reg) &&
- (Reg >= AArch64::P8 && Reg <= AArch64::P15) && SavedRegs.test(Reg) &&
- AFI->getPredicateRegForFillSpill() == 0)
- AFI->setPredicateRegForFillSpill((Reg - AArch64::P0) + AArch64::PN0);
-
- // Check if there is a pair of ZRegs, so it can select P8 to create PTRUE,
- // in case there is no PRege being saved(above)
- HasPairZReg =
- HasPairZReg || (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
- SavedRegs.test(CSRegs[i ^ 1]));
+ // Find a suitable predicate register for the multi-vector spill/fill
+ // instructions.
+ for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
+ if (SavedRegs.test(PReg)) {
+ AFI->setPredicateRegForFillSpill(PReg - AArch64::P0 + AArch64::PN0);
+ break;
+ }
}
- }
-
- // Make sure there is a PReg saved to be used in save and restore when there
- // is ZReg pair.
- if ((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
- (MF.getFunction().getCallingConv() ==
- CallingConv::AArch64_SVE_VectorCall ||
- MF.getFunction().getCallingConv() ==
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 ||
- MF.getFunction().getCallingConv() ==
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2))
- if (AFI->getPredicateRegForFillSpill() == 0 && HasPairZReg) {
- assert(!RegInfo->isReservedReg(MF, AArch64::P8) && "P8 is reserved");
+ // If no free callee-save has been found assign one.
+ if (!AFI->getPredicateRegForFillSpill() &&
+ MF.getFunction().getCallingConv() ==
+ CallingConv::AArch64_SVE_VectorCall) {
SavedRegs.set(AArch64::P8);
AFI->setPredicateRegForFillSpill(AArch64::PN8);
}
+ assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
+ "Predicate cannot be a reserved register");
+ }
+
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
!Subtarget.isTargetWindows()) {
// For Windows calling convention on a non-windows OS, where X18 is treated
>From 0b2c9f7860d3d54164ded438520d2dff6fae4957 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Tue, 7 May 2024 15:03:59 +0000
Subject: [PATCH 10/11] Fix formatting
---
.../Target/AArch64/AArch64FrameLowering.cpp | 20 +++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7b1622ce224bd..216f07c5f29f8 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3128,7 +3128,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
if (RPI.isPaired())
- MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+ MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
}
}
return true;
@@ -3262,15 +3262,15 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
}
MIB.addReg(Reg1, getDefRegState(true));
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameDestroy);
- MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
- MachineMemOperand::MOLoad, Size, Alignment));
- if (NeedsWinCFI)
- InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+ MIB.addReg(AArch64::SP)
+ .addImm(RPI.Offset) // [sp, #offset*scale]
+ // where factor*scale is implicit
+ .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOLoad, Size, Alignment));
+ if (NeedsWinCFI)
+ InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
}
}
return true;
>From a6d036b9b21d1c1a9981b90054efec73e21d4b96 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Thu, 9 May 2024 13:54:23 +0000
Subject: [PATCH 11/11] Add checks for ZReg and PRegs order for spill/fill
---
.../Target/AArch64/AArch64FrameLowering.cpp | 35 +++++++++++++++----
1 file changed, 29 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 216f07c5f29f8..8901b2af147ef 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2784,6 +2784,25 @@ struct RegPairInfo {
} // end anonymous namespace
+void verify(SmallVectorImpl<RegPairInfo> &RegPairs) {
+ auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
+ auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+ auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
+ auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+ assert(!(PPRBegin < ZPRBegin) &&
+ "Expected callee save predicate to be handled first");
+}
+
+unsigned findFreePredicateReg(BitVector &SavedRegs) {
+ for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
+ if (SavedRegs.test(PReg)) {
+ unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
+ return PNReg;
+ }
+ }
+ return AArch64::NoRegister;
+}
+
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -3076,6 +3095,9 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
unsigned PnReg = AFI->getPredicateRegForFillSpill();
assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
"Expects SVE2.1 or SME2 target and a predicate register");
+#ifdef EXPENSIVE_CHECKS
+ verify(RegPairs);
+#endif
if (!PTrueCreated) {
PTrueCreated = true;
BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
@@ -3232,6 +3254,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
unsigned PnReg = AFI->getPredicateRegForFillSpill();
assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
"Expects SVE2.1 or SME2 target and a predicate register");
+#ifdef EXPENSIVE_CHECKS
+ assert(!(PPRBegin < ZPRBegin) &&
+ "Expected callee save predicate to be handled first");
+#endif
if (!PTrueCreated) {
PTrueCreated = true;
BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
@@ -3364,12 +3390,9 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// Find a suitable predicate register for the multi-vector spill/fill
// instructions.
- for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
- if (SavedRegs.test(PReg)) {
- AFI->setPredicateRegForFillSpill(PReg - AArch64::P0 + AArch64::PN0);
- break;
- }
- }
+ unsigned PnReg = findFreePredicateReg(SavedRegs);
+ if (PnReg != AArch64::NoRegister)
+ AFI->setPredicateRegForFillSpill(PnReg);
// If no free callee-saved predicate has been found, assign one.
if (!AFI->getPredicateRegForFillSpill() &&
MF.getFunction().getCallingConv() ==