[llvm] [LLVM][AArch64]Use load/store with consecutive registers in SME2 or S… (PR #77665)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 10 10:42:35 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: None (CarolineConcatto)
Changes:
…VE2.1 for spill/fill
When possible, the spilling/filling of registers in Frame Lowering uses the ld/st consecutive-pair instructions available in SME2 or SVE2.1.
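In effect, pairs of consecutive Z-register spills/fills collapse into a single predicated two-vector store/load. A minimal before/after sketch in AArch64 assembly, lifted from the STRIDED check lines in the updated test output below (the exact offsets depend on the particular frame layout):

```
// Before: one 16-byte spill per Z register
str z9, [sp, #15, mul vl]
str z8, [sp, #16, mul vl]

// After (SME2/SVE2.1): one predicate-as-counter setup, then a 32-byte pair spill
ptrue pn8.b
st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl]
```

The `ptrue pn8.b` is emitted once per prologue/epilogue (tracked by the `PtrueCreated` flag in the diff) and reused for every paired spill/fill.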
---
Patch is 302.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77665.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+78-22)
- (modified) llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h (+2)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll (+528-928)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll (+528-928)
- (added) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll (+295)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index caab59201a8d69..7a332bfd18c21b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1480,6 +1480,11 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
default:
return false;
+ case AArch64::PTRUE_C_B:
+ case AArch64::LD1B_2Z_IMM:
+ case AArch64::ST1B_2Z_IMM:
+ return I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
+ I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2();
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
@@ -2753,6 +2758,16 @@ struct RegPairInfo {
} // end anonymous namespace
+unsigned findFreePredicateAsCounterReg(MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MCRegister PReg :
+ {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
+ AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+ if (!MRI.isReserved(PReg))
+ return PReg;
+ }
+ llvm_unreachable("cannot find a free predicate");
+}
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2763,6 +2778,7 @@ static void computeCalleeSaveRegisterPairs(
bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
+ const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -2831,7 +2847,11 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg2 = NextReg;
break;
case RegPairInfo::PPR:
+ break;
case RegPairInfo::ZPR:
+ if (Subtarget.hasSVE2p1() || Subtarget.hasSME2())
+ if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+ RPI.Reg2 = NextReg;
break;
}
}
@@ -2876,7 +2896,7 @@ static void computeCalleeSaveRegisterPairs(
assert(OffsetPre % Scale == 0);
if (RPI.isScalable())
- ScalableByteOffset += StackFillDir * Scale;
+ ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
else
ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
@@ -2887,9 +2907,6 @@ static void computeCalleeSaveRegisterPairs(
(IsWindows && RPI.Reg2 == AArch64::LR)))
ByteOffset += StackFillDir * 8;
- assert(!(RPI.isScalable() && RPI.isPaired()) &&
- "Paired spill/fill instructions don't exist for SVE vectors");
-
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -2976,6 +2993,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
return true;
}
+ bool PtrueCreated = false;
for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3010,10 +3028,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = AArch64::STR_ZXI;
- Size = 16;
- Alignment = Align(16);
- break;
+ StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ Size = 16;
+ Alignment = Align(16);
+ break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
@@ -3037,19 +3055,37 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
+
+ unsigned PnReg;
+ unsigned PairRegs;
+ if (RPI.isPaired() && RPI.isScalable()) {
+ PnReg = findFreePredicateAsCounterReg(MF);
+ PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ if (!PtrueCreated) {
+ PtrueCreated = true;
+ BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+ }
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
if (!MRI.isReserved(Reg2))
MBB.addLiveIn(Reg2);
- MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+ if (RPI.isScalable())
+ MIB.addReg(PairRegs);
+ else
+ MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
}
- MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
- .addReg(AArch64::SP)
+ if (RPI.isPaired() && RPI.isScalable())
+ MIB.addReg(PnReg);
+ else
+ MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+ MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale],
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
@@ -3061,8 +3097,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
- MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
+ if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+ MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+ if (RPI.isPaired())
+ MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+ }
}
return true;
@@ -3082,7 +3121,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
- auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+ auto EmitMI = [&](const RegPairInfo &RPI,
+ bool *PtrueCreated) -> MachineBasicBlock::iterator {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
@@ -3114,7 +3154,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = AArch64::LDR_ZXI;
+ LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3139,15 +3179,31 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
}
+
+ unsigned PnReg;
+ unsigned PairRegs;
+ if (RPI.isPaired() && RPI.isScalable()) {
+ PnReg = findFreePredicateAsCounterReg(MF);
+ PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+ if (!*PtrueCreated) {
+ *PtrueCreated = true;
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ }
+ }
+
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
- MIB.addReg(Reg2, getDefRegState(true));
+ MIB.addReg(RPI.isScalable() ? PairRegs : Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
}
- MIB.addReg(Reg1, getDefRegState(true))
- .addReg(AArch64::SP)
+ if (RPI.isPaired() && RPI.isScalable())
+ MIB.addReg(PnReg);
+ else
+ MIB.addReg(Reg1, getDefRegState(true));
+ MIB.addReg(AArch64::SP)
.addImm(RPI.Offset) // [sp, #offset*scale]
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameDestroy);
@@ -3161,9 +3217,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
};
// SVE objects are always restored in reverse order.
+ bool PtrueCreated = false;
for (const RegPairInfo &RPI : reverse(RegPairs))
if (RPI.isScalable())
- EmitMI(RPI);
+ EmitMI(RPI, &PtrueCreated);
if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
@@ -3174,13 +3231,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
}
return true;
}
-
if (ReverseCSRRestoreSeq) {
MachineBasicBlock::iterator First = MBB.end();
for (const RegPairInfo &RPI : reverse(RegPairs)) {
if (RPI.isScalable())
continue;
- MachineBasicBlock::iterator It = EmitMI(RPI);
+ MachineBasicBlock::iterator It = EmitMI(RPI, &PtrueCreated);
if (First == MBB.end())
First = It;
}
@@ -3190,7 +3246,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
for (const RegPairInfo &RPI : RegPairs) {
if (RPI.isScalable())
continue;
- (void)EmitMI(RPI);
+ (void)EmitMI(RPI, &PtrueCreated);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index cd4a18bfbc23a8..b44cc8d0d0dc9b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -300,6 +300,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
int FrameIdx = Info.getFrameIdx();
if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
continue;
+ if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ continue;
int64_t Offset = MFI.getObjectOffset(FrameIdx);
int64_t ObjSize = MFI.getObjectSize(FrameIdx);
MinOffset = std::min<int64_t>(Offset, MinOffset);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc0825673..6c94546c9525aa 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; STRIDED-NEXT: ret
@@ -103,20 +89,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-16
; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
; CONTIGUOUS-NEXT: mov p8.b, p0.b
@@ -130,21 +110,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ptrue pn8.b
+; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -210,45 +184,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
-; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STR...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/77665