[Mlir-commits] [llvm] [mlir] [DO NOT SUBMIT] Implement LowerVectorToArmNeon Pattern (PR #81895)
Kojo Acquah
llvmlistbot at llvm.org
Fri Feb 23 12:47:04 PST 2024
https://github.com/KoolJBlack updated https://github.com/llvm/llvm-project/pull/81895
>From 3c25a9de91e9c76ffba82939b19eafe3e60d51f7 Mon Sep 17 00:00:00 2001
From: Stella Laurenzo <stellaraccident at gmail.com>
Date: Tue, 20 Feb 2024 10:29:40 -0800
Subject: [PATCH 1/2] Revert "[AArch64] Restore Z-registers before P-registers
(#79623)"
This reverts commit 3f0404aae7ed2f7138526e1bcd100a60dfe08227.
---
.../Target/AArch64/AArch64FrameLowering.cpp | 19 ++--
.../framelayout-sve-calleesaves-fix.mir | 2 +-
llvm/test/CodeGen/AArch64/framelayout-sve.mir | 24 ++---
.../sme-streaming-compatible-interface.ll | 32 +++----
.../AArch64/sme-streaming-interface.ll | 32 +++----
.../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 32 +++----
.../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 32 +++----
.../test/CodeGen/AArch64/stack-probing-sve.ll | 4 +-
llvm/test/CodeGen/AArch64/sve-alloca.ll | 16 ++--
.../AArch64/sve-calling-convention-mixed.ll | 32 +++----
llvm/test/CodeGen/AArch64/sve-tailcall.ll | 32 +++----
llvm/test/CodeGen/AArch64/unwind-preserved.ll | 96 +++++++++----------
12 files changed, 176 insertions(+), 177 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0e9adde564b3e5..d98750e09d4e36 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3189,6 +3189,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return MIB->getIterator();
};
+ // SVE objects are always restored in reverse order.
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ if (RPI.isScalable())
+ EmitMI(RPI);
+
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
@@ -3199,19 +3204,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}
- // For performance reasons restore SVE register in increasing order
- auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
- auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
- auto PPREnd = std::find_if(RegPairs.rbegin(), RegPairs.rend(), IsPPR);
- std::reverse(PPRBegin, PPREnd.base());
- auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
- auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
- auto ZPREnd = std::find_if(RegPairs.rbegin(), RegPairs.rend(), IsZPR);
- std::reverse(ZPRBegin, ZPREnd.base());
-
if (ReverseCSRRestoreSeq) {
MachineBasicBlock::iterator First = MBB.end();
for (const RegPairInfo &RPI : reverse(RegPairs)) {
+ if (RPI.isScalable())
+ continue;
MachineBasicBlock::iterator It = EmitMI(RPI);
if (First == MBB.end())
First = It;
@@ -3220,6 +3217,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MBB.splice(MBBI, &MBB, First);
} else {
for (const RegPairInfo &RPI : RegPairs) {
+ if (RPI.isScalable())
+ continue;
(void)EmitMI(RPI);
}
}
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
index aed31450736191..3dba21d59b4087 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -19,8 +19,8 @@
; CHECK-NEXT: // implicit-def: $p4
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
- ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+ ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index f7920e595e44ba..213d7919e4a727 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -772,9 +772,9 @@ body: |
# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -873,14 +873,14 @@ body: |
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -1037,14 +1037,14 @@ body: |
# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18
-# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
+# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
@@ -1198,10 +1198,10 @@ body: |
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
+# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 6d2abf7e18419a..296f2be9cfee5e 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -226,30 +226,30 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fadd z0.d, z1.d, z0.d
; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -318,30 +318,30 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index de676ac5e0d2e6..86918a59f3810e 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -187,30 +187,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -267,30 +267,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index ea7808d73093e6..b7119fc0825673 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -129,6 +129,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -144,7 +145,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -284,6 +284,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -299,7 +300,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -440,6 +440,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -455,7 +456,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -595,6 +595,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -610,7 +611,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -751,6 +751,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -766,7 +767,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -906,6 +906,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -921,7 +922,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1062,6 +1062,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1077,7 +1078,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1217,6 +1217,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1232,7 +1233,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1380,6 +1380,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1394,7 +1395,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1545,6 +1545,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1559,7 +1560,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1711,6 +1711,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1725,7 +1726,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1877,6 +1877,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1891,7 +1892,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2043,6 +2043,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2057,7 +2058,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2209,6 +2209,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2223,7 +2224,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2375,6 +2375,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2389,7 +2390,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2541,6 +2541,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2555,7 +2556,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 7e2d28fbf79828..1fb251a4f628e9 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -82,6 +82,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -97,7 +98,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -190,6 +190,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -205,7 +206,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -299,6 +299,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -314,7 +315,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -407,6 +407,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -422,7 +423,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -516,6 +516,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -531,7 +532,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -624,6 +624,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -639,7 +640,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -733,6 +733,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -748,7 +749,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -841,6 +841,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -856,7 +857,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -955,6 +955,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -969,7 +970,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1071,6 +1071,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1085,7 +1086,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1188,6 +1188,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1202,7 +1203,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1304,6 +1304,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1318,7 +1319,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1421,6 +1421,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1435,7 +1436,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1537,6 +1537,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1551,7 +1552,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1654,6 +1654,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1668,7 +1669,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1770,6 +1770,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1784,7 +1785,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 56d865ef83e6bc..1ad78709d5012d 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -380,6 +380,7 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -396,7 +397,6 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #17
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -697,10 +697,10 @@ define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll
index d227538043fceb..47e49b84aaaffb 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll
@@ -66,30 +66,30 @@ define void @foo(<vscale x 4 x i64> %dst, i1 %cond) {
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: bl bar
; CHECK-NEXT: addvl sp, x29, #-18
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 3965af6a9066d6..9851583b950eba 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -567,30 +567,30 @@ define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -659,30 +659,30 @@ define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range() {
; CHECK-NEXT: fmov s7, #7.00000000
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
index 4ddf007768fd2c..f32c80d392b633 100644
--- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll
+++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
@@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
index 822be14faaeb1f..f3c4d217e6fcaa 100644
--- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll
+++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
@@ -63,6 +63,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -79,18 +91,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -112,6 +112,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -128,18 +140,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -215,6 +215,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -231,18 +243,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
@@ -264,6 +264,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -280,18 +292,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
>From 13dc6ef2d016ced5057f9b5d3ab294571f7a5c06 Mon Sep 17 00:00:00 2001
From: Kojo Acquah <kooljblack at google.com>
Date: Thu, 15 Feb 2024 17:59:46 +0000
Subject: [PATCH 2/2] Implement LowerVectorToArmNeon
---
.../include/mlir/Dialect/ArmNeon/Transforms.h | 21 +++
.../Conversion/VectorToLLVM/CMakeLists.txt | 1 +
mlir/lib/Dialect/ArmNeon/CMakeLists.txt | 15 +-
mlir/lib/Dialect/ArmNeon/IR/CMakeLists.txt | 13 ++
.../Dialect/ArmNeon/Transforms/CMakeLists.txt | 14 ++
.../Transforms/LowerVectorToArmNeon.cpp | 154 ++++++++++++++++++
.../llvm-project-overlay/mlir/BUILD.bazel | 21 +++
7 files changed, 226 insertions(+), 13 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/ArmNeon/Transforms.h
create mode 100644 mlir/lib/Dialect/ArmNeon/IR/CMakeLists.txt
create mode 100644 mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt
create mode 100644 mlir/lib/Dialect/ArmNeon/Transforms/LowerVectorToArmNeon.cpp
diff --git a/mlir/include/mlir/Dialect/ArmNeon/Transforms.h b/mlir/include/mlir/Dialect/ArmNeon/Transforms.h
new file mode 100644
index 00000000000000..41dbc2633d52c6
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmNeon/Transforms.h
@@ -0,0 +1,21 @@
+//===- Transforms.h - ArmNeon Dialect Transform Entrypoints -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMNEON_TRANSFORMS_H
+#define MLIR_DIALECT_ARMNEON_TRANSFORMS_H
+
+namespace mlir {
+
+// Forward declaration so that this header is self-contained: the declaration
+// below only needs the name, not the full definition, of RewritePatternSet.
+class RewritePatternSet;
+
+namespace arm_neon {
+/// Adds to `patterns` the rewrite pattern that lowers `vector.contract` to
+/// `arm_neon.intr.smmla`.
+void populateLowerVectorToArmNeonPatterns(RewritePatternSet &patterns);
+} // namespace arm_neon
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_ARMNEON_TRANSFORMS_H
diff --git a/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt b/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt
index 5fbb50f62395ec..a0fce139f27466 100644
--- a/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt
@@ -34,6 +34,7 @@ add_mlir_conversion_library(MLIRVectorToLLVMPass
MLIRVectorToLLVM
MLIRArmNeonDialect
+ MLIRArmNeonTransforms
MLIRArmSMEDialect
MLIRArmSMETransforms
MLIRArmSVEDialect
diff --git a/mlir/lib/Dialect/ArmNeon/CMakeLists.txt b/mlir/lib/Dialect/ArmNeon/CMakeLists.txt
index 060b6df1b334ad..9f57627c321fb0 100644
--- a/mlir/lib/Dialect/ArmNeon/CMakeLists.txt
+++ b/mlir/lib/Dialect/ArmNeon/CMakeLists.txt
@@ -1,13 +1,2 @@
-add_mlir_dialect_library(MLIRArmNeonDialect
- IR/ArmNeonDialect.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/ArmNeon
-
- DEPENDS
- MLIRArmNeonIncGen
-
- LINK_LIBS PUBLIC
- MLIRIR
- MLIRSideEffectInterfaces
- )
+add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/mlir/lib/Dialect/ArmNeon/IR/CMakeLists.txt b/mlir/lib/Dialect/ArmNeon/IR/CMakeLists.txt
new file mode 100644
index 00000000000000..b04919a3a31858
--- /dev/null
+++ b/mlir/lib/Dialect/ArmNeon/IR/CMakeLists.txt
@@ -0,0 +1,13 @@
+# ArmNeon dialect library, built from ArmNeonDialect.cpp in this directory.
+add_mlir_dialect_library(MLIRArmNeonDialect
+ ArmNeonDialect.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/ArmNeon
+
+ DEPENDS
+ MLIRArmNeonIncGen
+
+ LINK_LIBS PUBLIC
+ MLIRIR
+ MLIRSideEffectInterfaces
+ )
diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt
new file mode 100644
index 00000000000000..dcd806e981479d
--- /dev/null
+++ b/mlir/lib/Dialect/ArmNeon/Transforms/CMakeLists.txt
@@ -0,0 +1,14 @@
+# ArmNeon transformation patterns, built from LowerVectorToArmNeon.cpp.
+add_mlir_dialect_library(MLIRArmNeonTransforms
+  LowerVectorToArmNeon.cpp
+
+  DEPENDS
+  MLIRArmNeonIncGen
+
+  LINK_LIBS PUBLIC
+  # LowerVectorToArmNeon.cpp creates arith.extsi ops, so link Arith directly
+  # rather than relying on it arriving transitively.
+  MLIRArithDialect
+  MLIRArmNeonDialect
+  MLIRFuncDialect
+  MLIRVectorDialect
+  MLIRIR
+  MLIRLLVMCommonConversion
+  MLIRLLVMDialect
+  )
diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/LowerVectorToArmNeon.cpp b/mlir/lib/Dialect/ArmNeon/Transforms/LowerVectorToArmNeon.cpp
new file mode 100644
index 00000000000000..d80aebab1139fe
--- /dev/null
+++ b/mlir/lib/Dialect/ArmNeon/Transforms/LowerVectorToArmNeon.cpp
@@ -0,0 +1,154 @@
+//===- LowerVectorToArmNeon.cpp - Lower vector.contract to smmla ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering patterns from vector.contract to
+// arm_neon.intr.smmla
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ArmNeon/ArmNeonDialect.h"
+#include "mlir/Dialect/ArmNeon/Transforms.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "arm-neon-vector-lowering"
+
+using namespace mlir;
+using namespace mlir::arm_neon;
+
+namespace {
+
+// Rebuilds `container` around `element`: when `container` is a ShapedType the
+// result is that type cloned with `element` as its element type; for any other
+// `container` the bare `element` type is returned.
+static Type matchContainerType(Type element, Type container) {
+  auto shaped = dyn_cast<ShapedType>(container);
+  if (!shaped)
+    return element;
+  return shaped.clone(element);
+}
+
+// Rewrites a `vector.contract` whose operands are sign-extended narrow-integer
+// vectors into a single `arm_neon.intr.smmla` operation.
+//
+// The pattern applies only when all of the following hold:
+//   * the indexing maps are (m, k), (n, k) -> (m, n) over dims (m, n, k);
+//   * the iterator types are exactly [parallel, parallel, reduction];
+//   * the combining kind is `add`;
+//   * LHS and RHS are both 2x8, i.e. the tile [M, N, K] is [2, 2, 8];
+//   * LHS and RHS are each produced by an `arith.extsi` whose input is a
+//     vector with an element width of at most 8 bits;
+//   * the accumulator is a vector of i32.
+//
+// Every check runs before any operation is created, so a failed match leaves
+// the IR untouched.
+class LowerVectorToArmNeonPattern
+    : public OpRewritePattern<vector::ContractionOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(vector::ContractionOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    Value lhs = op.getLhs();
+    Value rhs = op.getRhs();
+    Value res = op.getAcc();
+
+    // The indexing maps must be exactly (m, k), (n, k) -> (m, n). The tile
+    // checks and the flattening below read M from LHS dim 0, N from RHS dim 0
+    // and K from dim 1 of both, so any other layout (for example a transposed
+    // operand or result) would be lowered incorrectly.
+    MLIRContext *ctx = op.getContext();
+    auto hasResults = [ctx](AffineMap map, unsigned first, unsigned second) {
+      return map.getNumDims() == 3 && map.getNumResults() == 2 &&
+             map.getResult(0) == getAffineDimExpr(first, ctx) &&
+             map.getResult(1) == getAffineDimExpr(second, ctx);
+    };
+    auto indexingMaps = op.getIndexingMapsArray();
+    if (indexingMaps.size() != 3 || !hasResults(indexingMaps[0], 0, 2) ||
+        !hasResults(indexingMaps[1], 1, 2) ||
+        !hasResults(indexingMaps[2], 0, 1)) {
+      return rewriter.notifyMatchFailure(
+          op, "expected indexing maps (m, k), (n, k) -> (m, n)");
+    }
+
+    // Check iterator types for contract
+    auto iteratorTypes = op.getIteratorTypesArray();
+    if (iteratorTypes.size() != 3 ||
+        iteratorTypes[0] != vector::IteratorType::parallel ||
+        iteratorTypes[1] != vector::IteratorType::parallel ||
+        iteratorTypes[2] != vector::IteratorType::reduction) {
+      return rewriter.notifyMatchFailure(
+          op, "expected [parallel, parallel, reduction] iterator types");
+    }
+
+    // smmla accumulates by addition; other combining kinds cannot be mapped.
+    if (op.getKind() != vector::CombiningKind::ADD)
+      return rewriter.notifyMatchFailure(op, "expected 'add' combining kind");
+
+    // Check the tile size by mapping the dimensions of the contract
+    // -- Tile size: [2, 2, 8]
+    // Infer tile sizes from operands. Check required tile size
+    mlir::VectorType lhsType = op.getLhsType();
+    mlir::VectorType rhsType = op.getRhsType();
+    auto dimM = lhsType.getDimSize(0);
+    auto dimN = rhsType.getDimSize(0);
+    auto dimK = lhsType.getDimSize(1);
+    if (rhsType.getDimSize(1) != dimK || dimM != 2 || dimN != 2 || dimK != 8)
+      return rewriter.notifyMatchFailure(op, "expected a [2, 2, 8] tile");
+
+    // The accumulator is flattened to the 4 x i32 vector that smmla produces.
+    auto accType = dyn_cast<VectorType>(res.getType());
+    if (!accType || !accType.getElementType().isInteger(32))
+      return rewriter.notifyMatchFailure(op, "expected an i32 accumulator");
+
+    // Both inputs must come from a sign extension.
+    auto origLhsExtOp = lhs.getDefiningOp<arith::ExtSIOp>();
+    auto origRhsExtOp = rhs.getDefiningOp<arith::ExtSIOp>();
+    if (!origLhsExtOp || !origRhsExtOp)
+      return rewriter.notifyMatchFailure(op, "expected arith.extsi operands");
+
+    // Match any iX with X <= 8 as the pre-extension element type.
+    auto lhsExtInType = dyn_cast<VectorType>(origLhsExtOp.getIn().getType());
+    auto rhsExtInType = dyn_cast<VectorType>(origRhsExtOp.getIn().getType());
+    if (!lhsExtInType || !rhsExtInType ||
+        lhsExtInType.getElementTypeBitWidth() > 8 ||
+        rhsExtInType.getElementTypeBitWidth() > 8) {
+      return rewriter.notifyMatchFailure(
+          op, "expected extsi inputs of at most 8 bits per element");
+    }
+
+    // All match checks passed; IR is only modified from here on.
+
+    // Bring an input to i8 elements. Inputs that are already i8 are used
+    // directly, because arith.extsi requires a strictly wider result type.
+    auto extendToI8 = [&](Value narrow, VectorType narrowType) -> Value {
+      if (narrowType.getElementTypeBitWidth() == 8)
+        return narrow;
+      Type i8VectorType = matchContainerType(rewriter.getI8Type(), narrowType);
+      return rewriter.create<arith::ExtSIOp>(loc, i8VectorType, narrow);
+    };
+    Value extsiLhs = extendToI8(origLhsExtOp.getIn(), lhsExtInType);
+    Value extsiRhs = extendToI8(origRhsExtOp.getIn(), rhsExtInType);
+
+    // Collapse to 1D vectors required by smmla intrinsic
+    auto collapsedInputType = VectorType::get({16}, rewriter.getI8Type());
+    auto collapsedOutputType = VectorType::get({4}, accType.getElementType());
+    auto collapsedLhs = rewriter.create<vector::ShapeCastOp>(
+        loc, collapsedInputType, extsiLhs);
+    auto collapsedRhs = rewriter.create<vector::ShapeCastOp>(
+        loc, collapsedInputType, extsiRhs);
+    auto collapsedRes = rewriter.create<vector::ShapeCastOp>(
+        res.getLoc(), collapsedOutputType, res);
+
+    // Replace the contract with a neon op
+    auto smmlaOp = rewriter.create<arm_neon::SmmlaOp>(
+        op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs,
+        collapsedRhs);
+
+    // Reshape output back to 2D
+    rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(op, op.getResultType(),
+                                                     smmlaOp);
+    return success();
+  }
+};
+
+} // namespace
+
+// Registers LowerVectorToArmNeonPattern in `patterns` with a benefit of 1,
+// using the MLIRContext owned by the pattern set.
+void mlir::arm_neon::populateLowerVectorToArmNeonPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<LowerVectorToArmNeonPattern>(patterns.getContext(),
+                                            /*benefit=*/1);
+}
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index a21bc01aa1e3ca..b787dabbbec273 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1907,6 +1907,25 @@ cc_library(
],
)
+cc_library(
+    name = "ArmNeonTransforms",
+    srcs = ["lib/Dialect/ArmNeon/Transforms/LowerVectorToArmNeon.cpp"],
+    hdrs = ["include/mlir/Dialect/ArmNeon/Transforms.h"],
+    includes = ["include"],
+    # :Support and :Transforms are listed because the source includes
+    # mlir/Support/LogicalResult.h and
+    # mlir/Transforms/GreedyPatternRewriteDriver.h.
+    deps = [
+        ":ArithDialect",
+        ":ArmNeonDialect",
+        ":ArmNeonIncGen",
+        ":FuncDialect",
+        ":IR",
+        ":LLVMDialect",
+        ":SideEffectInterfaces",
+        ":Support",
+        ":Transforms",
+        ":VectorDialect",
+        "//llvm:Core",
+        "//llvm:Support",
+    ],
+)
+
gentbl_cc_library(
name = "ArmNeonConversionIncGen",
tbl_outs = [
@@ -8593,7 +8612,9 @@ cc_library(
":ArmNeonDialect",
":ArmNeonIncGen",
":IR",
+ ":Support",
":ToLLVMIRTranslation",
+ ":Transforms",
"//llvm:Core",
"//llvm:Support",
],
More information about the Mlir-commits
mailing list