[llvm] [NFC][AMDGPU] Refactor allocatePreloadKernArgSGPRs (PR #139753)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 17 07:40:26 PDT 2025
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/139753
From fa63d5141990305525deb4cd348ebce93ed4ae34 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Tue, 8 Apr 2025 11:39:52 +0000
Subject: [PATCH] [NFC][AMDGPU] Make allocatePreloadKernArgSGPRs usable from
both SDAG and GISel
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 9 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 157 +++++++++++-------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
3 files changed, 97 insertions(+), 70 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c51cc2a2fe529..205bee62885ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1213,8 +1213,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
uint64_t ExplicitArgOffset = 0;
const DataLayout &DL = Fn.getDataLayout();
- unsigned InIndex = 0;
-
for (const Argument &Arg : Fn.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *BaseArgTy = Arg.getType();
@@ -1303,10 +1301,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
unsigned PartOffset = 0;
for (unsigned i = 0; i != NumRegs; ++i) {
- State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
- BasePartOffset + PartOffset,
- MemVT.getSimpleVT(),
- CCValAssign::Full));
+ State.addLoc(CCValAssign::getCustomMem(
+ Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(), CCValAssign::Full));
PartOffset += MemVT.getStoreSize();
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ced3a6ba9bc0..665a324714c9e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2561,84 +2561,115 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+ uint64_t ArgOffset, unsigned ArgSize,
+ unsigned Idx, MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info, CCState &CCInfo) {
+ GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+ const Align KernelArgBaseAlign = Align(16);
+ Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+ constexpr unsigned SGPRSize = 4;
+ unsigned NumAllocSGPRs = alignTo(ArgSize, SGPRSize) / SGPRSize;
+
+ // Arg is preloaded into the previous SGPR.
+ if (ArgSize < SGPRSize && Alignment < SGPRSize) {
+ assert(Idx >= 1 && "No previous SGPR");
+ AMDGPUFunctionArgInfo &ArgInfo = Info.getArgInfo();
+ auto &ArgDesc = ArgInfo.PreloadKernArgs[Idx];
+ auto &PrevArgDesc = ArgInfo.PreloadKernArgs[Idx - 1];
+ ArgDesc.Regs.push_back(PrevArgDesc.Regs[0]);
+ return true;
+ }
+
+ unsigned Padding = ArgOffset - LastExplicitArgOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, SGPRSize) / SGPRSize;
+ // Check for free user SGPRs for preloading.
+ if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs())
+ return false;
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+ if (PreloadRegs->size() > 1)
+ RC = &AMDGPU::SGPR_32RegClass;
+
+ for (MCRegister Reg : *PreloadRegs) {
+ assert(Reg);
+ MF.addLiveIn(Reg, RC);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ LastExplicitArgOffset = NumAllocSGPRs * SGPRSize + ArgOffset;
+ return true;
+}
+
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
- CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+ CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
Function &F = MF.getFunction();
- unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
- GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
- bool InPreloadSequence = true;
- unsigned InIdx = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+ uint64_t ExplicitArgOffset = BaseOffset;
+ uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+ unsigned LocIdx = 0;
bool AlignedForImplictArgs = false;
unsigned ImplicitArgOffset = 0;
+
for (auto &Arg : F.args()) {
- if (!InPreloadSequence || !Arg.hasInRegAttr())
+ if (!Arg.hasInRegAttr())
break;
- unsigned ArgIdx = Arg.getArgNo();
- // Don't preload non-original args or parts not in the current preload
- // sequence.
- if (InIdx < Ins.size() &&
- (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+ if (AllocSize == 0)
break;
- for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
- Ins[InIdx].getOrigArgIndex() == ArgIdx;
- InIdx++) {
- assert(ArgLocs[ArgIdx].isMemLoc());
- auto &ArgLoc = ArgLocs[InIdx];
- const Align KernelArgBaseAlign = Align(16);
- unsigned ArgOffset = ArgLoc.getLocMemOffset();
- Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
- unsigned NumAllocSGPRs =
- alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
- // Fix alignment for hidden arguments.
- if (Arg.hasAttribute("amdgpu-hidden-argument")) {
- if (!AlignedForImplictArgs) {
- ImplicitArgOffset =
- alignTo(LastExplicitArgOffset,
- Subtarget->getAlignmentForImplicitArgPtr()) -
- LastExplicitArgOffset;
- AlignedForImplictArgs = true;
- }
- ArgOffset += ImplicitArgOffset;
- }
+ MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+ Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
- // Arg is preloaded into the previous SGPR.
- if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
- assert(InIdx >= 1 && "No previous SGPR");
- Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
- Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
- continue;
- }
+ // Fix alignment for hidden arguments.
+ if (Arg.hasAttribute("amdgpu-hidden-argument") && !AlignedForImplictArgs) {
+ ImplicitArgOffset = alignTo(LastExplicitArgOffset,
+ Subtarget->getAlignmentForImplicitArgPtr()) -
+ LastExplicitArgOffset;
+ AlignedForImplictArgs = true;
+ }
- unsigned Padding = ArgOffset - LastExplicitArgOffset;
- unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
- // Check for free user SGPRs for preloading.
- if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
- InPreloadSequence = false;
- break;
- }
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
- // Preload this argument.
- const TargetRegisterClass *RC =
- TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
- SmallVectorImpl<MCRegister> *PreloadRegs =
- Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
- if (PreloadRegs->size() > 1)
- RC = &AMDGPU::SGPR_32RegClass;
- for (auto &Reg : *PreloadRegs) {
- assert(Reg);
- MF.addLiveIn(Reg, RC);
- CCInfo.AllocateReg(Reg);
- }
+ if (ArgLocs.empty()) {
+ // GlobalISel: ArgLocs is empty; argument offsets were computed above from the DataLayout.
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ ArgOffset += ImplicitArgOffset;
- LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+ if (!allocPreloadKernArg(LastExplicitArgOffset, ArgOffset, AllocSize,
+ Arg.getArgNo(), MF, TRI, Info, CCInfo))
+ return; // No more available SGPRs
+ } else {
+ // SelectionDAG: walk the CCValAssign locations belonging to this argument.
+ for (; LocIdx < ArgLocs.size() &&
+ ArgLocs[LocIdx].getValNo() == Arg.getArgNo();
+ LocIdx++) {
+ CCValAssign &ArgLoc = ArgLocs[LocIdx];
+ assert(ArgLoc.isMemLoc());
+ uint64_t LocOffset = ArgLoc.getLocMemOffset();
+ unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ LocOffset += ImplicitArgOffset;
+
+ if (!allocPreloadKernArg(LastExplicitArgOffset, LocOffset, LocSize,
+ LocIdx, MF, TRI, Info, CCInfo))
+ return; // No more available SGPRs
+ }
}
}
}
@@ -2959,7 +2990,7 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
if (IsKernel && Subtarget->hasKernargPreload())
- allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+ allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d71a22722129e..65cee100d63ec 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -579,7 +579,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void allocatePreloadKernArgSGPRs(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
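
The refactor funnels both selectors through the new static allocPreloadKernArg helper behind a single entry point. A minimal sketch of how the two call sites are expected to look (the GlobalISel call site and the EmptyArgLocs name below are illustrative assumptions; this patch only updates the SelectionDAG call site in LowerFormalArguments):

    // SelectionDAG (SIISelLowering.cpp): ArgLocs holds the per-part CCValAssign
    // memory locations produced by analyzeFormalArgumentsCompute, so preload
    // offsets are taken from the assigned locations.
    allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);

    // GlobalISel (assumed caller): passing an empty ArgLocs vector selects the
    // ArgLocs.empty() branch, which recomputes argument offsets directly from
    // the DataLayout instead of CCValAssign locations.
    SmallVector<CCValAssign, 0> EmptyArgLocs;
    allocatePreloadKernArgSGPRs(CCInfo, EmptyArgLocs, MF, *TRI, *Info);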