[llvm] 7f0c5b0 - [AArch64]Fix invalid use of ld1/st1 in stack alloc (#105518)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 5 06:47:13 PDT 2024
Author: Lukacma
Date: 2024-09-05T14:47:10+01:00
New Revision: 7f0c5b0502b462d2afad32d3681b37cfc15ba844
URL: https://github.com/llvm/llvm-project/commit/7f0c5b0502b462d2afad32d3681b37cfc15ba844
DIFF: https://github.com/llvm/llvm-project/commit/7f0c5b0502b462d2afad32d3681b37cfc15ba844.diff
LOG: [AArch64]Fix invalid use of ld1/st1 in stack alloc (#105518)
This patch fixes incorrect usage of the scalar+immediate variant of the ld1/st1
instructions during stack allocation, which was introduced by
[c4bac7f](https://github.com/llvm/llvm-project/commit/c4bac7f7dcd931a5e561604e95656a24c3d1c9d9).
That earlier commit used ld1/st1 even when the stack offset was outside the immediate
range of these instructions, producing invalid assembly. It also used incorrect offsets when emitting ld1/st1.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ad20e76d0fe2e0..7e041b086599b9 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3020,6 +3020,7 @@ static void computeCalleeSaveRegisterPairs(
ByteOffset += StackFillDir * StackHazardSize;
LastReg = RPI.Reg1;
+ int Scale = RPI.getScale();
// Add the next reg to the pair if it is in the same register class.
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
Register NextReg = CSI[i + RegInc].getReg();
@@ -3045,9 +3046,14 @@ static void computeCalleeSaveRegisterPairs(
case RegPairInfo::PPR:
break;
case RegPairInfo::ZPR:
- if (AFI->getPredicateRegForFillSpill() != 0)
- if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+ if (AFI->getPredicateRegForFillSpill() != 0 &&
+ ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
+ // Calculate offset of register pair to see if pair instruction can be
+ // used.
+ int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
+ if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
RPI.Reg2 = NextReg;
+ }
break;
case RegPairInfo::VG:
break;
@@ -3087,7 +3093,6 @@ static void computeCalleeSaveRegisterPairs(
if (NeedsWinCFI &&
RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
- int Scale = RPI.getScale();
int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
assert(OffsetPre % Scale == 0);
@@ -3356,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale],
+ // where 2*vscale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3378,8 +3383,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
+ .addImm(RPI.Offset) // [sp, #offset*vscale],
+ // where factor*vscale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3523,8 +3528,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(PnReg);
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
+ .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale]
+ // where 2*vscale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
@@ -3541,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
}
MIB.addReg(Reg1, getDefRegState(true));
MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
+ .addImm(RPI.Offset) // [sp, #offset*vscale]
+ // where factor*vscale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index a96f9e382ed1a8..8724e7c1c368d9 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -332,16 +332,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -349,7 +349,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -427,16 +429,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: ptrue pn8.b
; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -444,7 +446,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: addvl sp, sp, #1
; FP-CHECK-NEXT: ptrue pn8.b
+; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 29d3d68fc4c3de..c63899cf7d2575 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,31 +55,45 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -89,14 +103,20 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -109,15 +129,21 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -184,31 +210,45 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -218,14 +258,20 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -238,15 +284,21 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -314,31 +366,45 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -348,14 +414,20 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -368,15 +440,21 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -443,31 +521,45 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -477,14 +569,20 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -497,15 +595,21 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -573,31 +677,45 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -607,14 +725,20 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -627,15 +751,21 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -702,31 +832,45 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -736,14 +880,20 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -756,15 +906,21 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -832,31 +988,45 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -866,14 +1036,20 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -886,15 +1062,21 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -961,31 +1143,45 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -995,14 +1191,20 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1015,15 +1217,21 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1093,32 +1301,46 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1130,13 +1352,18 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1154,14 +1381,19 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1234,32 +1466,46 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1271,13 +1517,18 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1295,14 +1546,19 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1376,32 +1632,46 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1413,13 +1683,18 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1437,14 +1712,19 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1518,32 +1798,46 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1555,13 +1849,18 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1579,14 +1878,19 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1660,32 +1964,46 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1697,13 +2015,18 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1721,14 +2044,19 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1802,32 +2130,46 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1839,13 +2181,18 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1863,14 +2210,19 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1944,32 +2296,46 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1981,13 +2347,18 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2005,14 +2376,19 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2086,32 +2462,46 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2123,13 +2513,18 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -2147,14 +2542,19 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 3d3748e1011228..05241f788d3ead 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -8,31 +8,45 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -42,14 +56,20 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -62,15 +82,21 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -90,31 +116,45 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -124,14 +164,20 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -144,15 +190,21 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -173,31 +225,45 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -207,14 +273,20 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -227,15 +299,21 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -255,31 +333,45 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -289,14 +381,20 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -309,15 +407,21 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -338,31 +442,45 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -372,14 +490,20 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -392,15 +516,21 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -420,31 +550,45 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -454,14 +598,20 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -474,15 +624,21 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -503,31 +659,45 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -537,14 +707,20 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -557,15 +733,21 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -585,31 +767,45 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: mov z1.d, z8.d
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -619,14 +815,20 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -639,15 +841,21 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -668,32 +876,46 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -705,13 +927,18 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -729,14 +956,19 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -760,32 +992,46 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -797,13 +1043,18 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -821,14 +1072,19 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -853,32 +1109,46 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -890,13 +1160,18 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -914,14 +1189,19 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -945,32 +1225,46 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -982,13 +1276,18 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1006,14 +1305,19 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1038,32 +1342,46 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1075,13 +1393,18 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1099,14 +1422,19 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1130,32 +1458,46 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1167,13 +1509,18 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1191,14 +1538,19 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1223,32 +1575,46 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1260,13 +1626,18 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1284,14 +1655,19 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1315,32 +1691,46 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
-; STRIDED-NEXT: ptrue pn8.b
-; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z2.d, z8.d
; STRIDED-NEXT: mov z3.d, z12.d
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z4.d
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1352,13 +1742,18 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: addvl sp, sp, #-15
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-4
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -1376,14 +1771,19 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
index 470c0dd45782c9..c9d216935edbf3 100644
--- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll
@@ -91,16 +91,16 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: ptrue pn8.b
; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -108,7 +108,8 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
; PAIR-NEXT: .cfi_offset w30, -8
; PAIR-NEXT: .cfi_offset w29, -16
@@ -122,15 +123,16 @@ define void @fbyte(<vscale x 16 x i8> %v) {
; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; PAIR-NEXT: bl my_func
; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -233,16 +235,16 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: ptrue pn8.b
; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -250,7 +252,8 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
; PAIR-NEXT: .cfi_offset w30, -8
; PAIR-NEXT: .cfi_offset w29, -16
@@ -264,15 +267,16 @@ define void @fhalf(<vscale x 8 x half> %v) {
; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; PAIR-NEXT: bl my_func
; PAIR-NEXT: ptrue pn8.b
+; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -328,7 +332,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() {
; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; PAIR-NEXT: .cfi_offset w29, -16
; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
@@ -338,7 +342,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() {
; PAIR-NEXT: //NO_APP
; PAIR-NEXT: ptrue pn8.b
; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -385,7 +389,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() {
; PAIR-NEXT: ptrue pn9.b
; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill
; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #2, mul vl] // 32-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; PAIR-NEXT: .cfi_offset w29, -16
; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
@@ -396,7 +400,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() {
; PAIR-NEXT: ptrue pn9.b
; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #2, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; PAIR-NEXT: addvl sp, sp, #4
; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -431,17 +435,17 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_regs() {
; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; PAIR-NEXT: addvl sp, sp, #-3
; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; PAIR-NEXT: ptrue pn8.b
-; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; PAIR-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; PAIR-NEXT: .cfi_offset w29, -16
; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; PAIR-NEXT: //APP
; PAIR-NEXT: //NO_APP
-; PAIR-NEXT: ptrue pn8.b
-; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; PAIR-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; PAIR-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; PAIR-NEXT: addvl sp, sp, #3
; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; PAIR-NEXT: ret
More information about the llvm-commits
mailing list