[llvm] [AArch64][SME] Disable hazard padding when there is only PPRs and GPRs (PR #137817)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 29 07:15:36 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
<details>
<summary>Changes</summary>
Both PPR and GPR accesses occur on the CPU; adding hazard padding in this case is unnecessary.
---
Full diff: https://github.com/llvm/llvm-project/pull/137817.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+16-3)
- (modified) llvm/test/CodeGen/AArch64/stack-hazard.ll (+18-66)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 78ac57e3e92a6..33d5bb59b83ff 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3533,6 +3533,12 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
return getMMOFrameID(*MI.memoperands_begin(), MFI);
}
+/// Returns true if PPRs are spilled as ZPRs.
+static bool arePPRsSpilledAsZPR(const MachineFunction &MF) {
+ return MF.getSubtarget().getRegisterInfo()->getSpillSize(
+ AArch64::PPRRegClass) == 16;
+}
+
// Check if a Hazard slot is needed for the current function, and if so create
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
// which can be used to determine if any hazard padding is needed.
@@ -3552,11 +3558,12 @@ void AArch64FrameLowering::determineStackHazardSlot(
// Add a hazard slot if there are any CSR FPR registers, or are any fp-only
// stack objects.
- bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
+ bool ZPRPPRSpills = arePPRsSpilledAsZPR(MF);
+ bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [ZPRPPRSpills](unsigned Reg) {
return AArch64::FPR64RegClass.contains(Reg) ||
AArch64::FPR128RegClass.contains(Reg) ||
AArch64::ZPRRegClass.contains(Reg) ||
- AArch64::PPRRegClass.contains(Reg);
+ (ZPRPPRSpills && AArch64::PPRRegClass.contains(Reg));
});
bool HasFPRStackObjects = false;
if (!HasFPRCSRs) {
@@ -5066,6 +5073,8 @@ void AArch64FrameLowering::orderFrameObjects(
// Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
// the same time.
GroupBuilder GB(FrameObjects);
+
+ bool ZPRPPRSPills = arePPRsSpilledAsZPR(MF);
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.isDebugInstr())
@@ -5074,7 +5083,11 @@ void AArch64FrameLowering::orderFrameObjects(
if (AFI.hasStackHazardSlotIndex()) {
std::optional<int> FI = getLdStFrameID(MI, MFI);
if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
- if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
+ // Note: PPR accesses should be treated as GPR accesses for the
+ // purposes of stack hazards as PPR ld/sts are handled on the CPU.
+ if ((MFI.getStackID(*FI) == TargetStackID::ScalableVector &&
+ (ZPRPPRSPills ||
+ !AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))) ||
AArch64InstrInfo::isFpOrNEON(MI))
FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
else
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 791d7580c327d..784d1b51093ec 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1497,72 +1497,24 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>
}
define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
-; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK0-NEXT: addvl sp, sp, #-1
-; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p5.b, p0.b
-; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p4.b, p1.b
-; CHECK0-NEXT: mov p0.b, p2.b
-; CHECK0-NEXT: mov p1.b, p3.b
-; CHECK0-NEXT: mov p2.b, p5.b
-; CHECK0-NEXT: mov p3.b, p4.b
-; CHECK0-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: addvl sp, sp, #1
-; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK64: // %bb.0:
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-1
-; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: sub sp, sp, #64
-; CHECK64-NEXT: mov p4.b, p1.b
-; CHECK64-NEXT: mov p5.b, p0.b
-; CHECK64-NEXT: mov p0.b, p2.b
-; CHECK64-NEXT: mov p1.b, p3.b
-; CHECK64-NEXT: mov p2.b, p5.b
-; CHECK64-NEXT: mov p3.b, p4.b
-; CHECK64-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK64-NEXT: add sp, sp, #64
-; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
-; CHECK64-NEXT: ret
-;
-; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK1024: // %bb.0:
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov p4.b, p1.b
-; CHECK1024-NEXT: mov p5.b, p0.b
-; CHECK1024-NEXT: mov p0.b, p2.b
-; CHECK1024-NEXT: mov p1.b, p3.b
-; CHECK1024-NEXT: mov p2.b, p5.b
-; CHECK1024-NEXT: mov p3.b, p4.b
-; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #1
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p5.b, p0.b
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p4.b, p1.b
+; CHECK-NEXT: mov p0.b, p2.b
+; CHECK-NEXT: mov p1.b, p3.b
+; CHECK-NEXT: mov p2.b, p5.b
+; CHECK-NEXT: mov p3.b, p4.b
+; CHECK-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
ret [2 x <vscale x 4 x i1>] %res
}
``````````
</details>
https://github.com/llvm/llvm-project/pull/137817
More information about the llvm-commits
mailing list