[llvm] 48af281 - Revert "[AArch64] Restore Z-registers before P-registers (#79623)"
Caroline Concatto via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 20 10:14:15 PST 2024
Author: Caroline Concatto
Date: 2024-02-20T18:13:33Z
New Revision: 48af281f7a5abe0b01daf7847d624d2a6b0ae9fa
URL: https://github.com/llvm/llvm-project/commit/48af281f7a5abe0b01daf7847d624d2a6b0ae9fa
DIFF: https://github.com/llvm/llvm-project/commit/48af281f7a5abe0b01daf7847d624d2a6b0ae9fa.diff
LOG: Revert "[AArch64] Restore Z-registers before P-registers (#79623)"
This reverts commit 3f0404aae7ed2f7138526e1bcd100a60dfe08227.
The use of std::reverse introduced by that commit is breaking some builds.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
llvm/test/CodeGen/AArch64/framelayout-sve.mir
llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
llvm/test/CodeGen/AArch64/stack-probing-sve.ll
llvm/test/CodeGen/AArch64/sve-alloca.ll
llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
llvm/test/CodeGen/AArch64/sve-tailcall.ll
llvm/test/CodeGen/AArch64/unwind-preserved.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0e9adde564b3e5..d98750e09d4e36 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3189,6 +3189,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return MIB->getIterator();
};
+ // SVE objects are always restored in reverse order.
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ if (RPI.isScalable())
+ EmitMI(RPI);
+
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -3199,19 +3204,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}
- // For performance reasons restore SVE register in increasing order
- auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
- auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
- auto PPREnd = std::find_if(RegPairs.rbegin(), RegPairs.rend(), IsPPR);
- std::reverse(PPRBegin, PPREnd.base());
- auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
- auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
- auto ZPREnd = std::find_if(RegPairs.rbegin(), RegPairs.rend(), IsZPR);
- std::reverse(ZPRBegin, ZPREnd.base());
-
if (ReverseCSRRestoreSeq) {
MachineBasicBlock::iterator First = MBB.end();
for (const RegPairInfo &RPI : reverse(RegPairs)) {
+ if (RPI.isScalable())
+ continue;
MachineBasicBlock::iterator It = EmitMI(RPI);
if (First == MBB.end())
First = It;
@@ -3220,6 +3217,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MBB.splice(MBBI, &MBB, First);
} else {
for (const RegPairInfo &RPI : RegPairs) {
+ if (RPI.isScalable())
+ continue;
(void)EmitMI(RPI);
}
}
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
index aed31450736191..3dba21d59b4087 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -19,8 +19,8 @@
; CHECK-NEXT: // implicit-def: $p4
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
- ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+ ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index f7920e595e44ba..213d7919e4a727 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -772,9 +772,9 @@ body: |
# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -873,14 +873,14 @@ body: |
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -1037,14 +1037,14 @@ body: |
# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18
-# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
@@ -1198,10 +1198,10 @@ body: |
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
+# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 6d2abf7e18419a..296f2be9cfee5e 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -226,30 +226,30 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fadd z0.d, z1.d, z0.d
; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -318,30 +318,30 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index de676ac5e0d2e6..86918a59f3810e 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -187,30 +187,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -267,30 +267,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index ea7808d73093e6..b7119fc0825673 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -129,6 +129,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -144,7 +145,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -284,6 +284,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -299,7 +300,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -440,6 +440,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -455,7 +456,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -595,6 +595,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -610,7 +611,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -751,6 +751,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -766,7 +767,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -906,6 +906,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -921,7 +922,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1062,6 +1062,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1077,7 +1078,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1217,6 +1217,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1232,7 +1233,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1380,6 +1380,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1394,7 +1395,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1545,6 +1545,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1559,7 +1560,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1711,6 +1711,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1725,7 +1726,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1877,6 +1877,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1891,7 +1892,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2043,6 +2043,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2057,7 +2058,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2209,6 +2209,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2223,7 +2224,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2375,6 +2375,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2389,7 +2390,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2541,6 +2541,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2555,7 +2556,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 7e2d28fbf79828..1fb251a4f628e9 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -82,6 +82,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -97,7 +98,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -190,6 +190,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -205,7 +206,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -299,6 +299,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -314,7 +315,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -407,6 +407,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -422,7 +423,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -516,6 +516,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -531,7 +532,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -624,6 +624,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -639,7 +640,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -733,6 +733,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -748,7 +749,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -841,6 +841,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -856,7 +857,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -955,6 +955,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -969,7 +970,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1071,6 +1071,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1085,7 +1086,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1188,6 +1188,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1202,7 +1203,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1304,6 +1304,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1318,7 +1319,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1421,6 +1421,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1435,7 +1436,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1537,6 +1537,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1551,7 +1552,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1654,6 +1654,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1668,7 +1669,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1770,6 +1770,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1784,7 +1785,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 56d865ef83e6bc..1ad78709d5012d 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -380,6 +380,7 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -396,7 +397,6 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #17
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -697,10 +697,10 @@ define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll
index d227538043fceb..47e49b84aaaffb 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll
@@ -66,30 +66,30 @@ define void @foo(<vscale x 4 x i64> %dst, i1 %cond) {
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: bl bar
; CHECK-NEXT: addvl sp, x29, #-18
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 3965af6a9066d6..9851583b950eba 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -567,30 +567,30 @@ define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -659,30 +659,30 @@ define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range() {
; CHECK-NEXT: fmov s7, #7.00000000
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
index 4ddf007768fd2c..f32c80d392b633 100644
--- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll
+++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
@@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
index 822be14faaeb1f..f3c4d217e6fcaa 100644
--- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll
+++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
@@ -63,6 +63,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -79,18 +91,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -112,6 +112,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -128,18 +140,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -215,6 +215,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -231,18 +243,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
@@ -264,6 +264,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -280,18 +292,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
More information about the llvm-commits
mailing list