[llvm] [AMDGPU] Add support for preloading hidden groupsize args (PR #83817)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 4 02:13:51 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-ir
Author: Austin Kerbow (kerbowa)
WIP
---
Patch is 42.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83817.diff
8 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+13-9)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (+122-2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+124-3)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+22-1)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+21-2)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+2)
- (added) llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll (+528)
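For orientation, here is a minimal sketch of the IR-level rewrite performed by the AMDGPULowerKernelArguments changes in the diff below. The kernel name and body are hypothetical; only the transform they illustrate comes from the patch: 2-byte loads of the hidden group-size fields at offsets 12/14/16 from llvm.amdgcn.implicitarg.ptr are replaced with the new llvm.amdgcn.workgroup.size.{x,y,z} intrinsics, and a matching "amdgpu-preload-work-group-size-*" attribute is added so the backend knows to preload the containing kernarg dword into a user SGPR. The two definitions are before/after snapshots of the same kernel, not a single module.

```llvm
; Before: hidden_group_size_x is read with a 2-byte load at offset 12 from the
; implicit kernarg pointer (the explicit arg is already marked inreg for preload).
define amdgpu_kernel void @example(ptr addrspace(1) inreg %out) {
  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %gep = getelementptr inbounds i8, ptr addrspace(4) %implicitarg, i64 12
  %size.x = load i16, ptr addrspace(4) %gep, align 4
  %ext = zext i16 %size.x to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}

; After (sketch): the load is replaced by the new intrinsic and the function is
; tagged so ISel preloads the value from a user SGPR instead of loading it.
define amdgpu_kernel void @example(ptr addrspace(1) inreg %out) #0 {
  %size.x = call i16 @llvm.amdgcn.workgroup.size.x()
  %ext = zext i16 %size.x to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}

declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
declare i16 @llvm.amdgcn.workgroup.size.x()

attributes #0 = { "amdgpu-preload-work-group-size-x" }
```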
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0f29653f1f5bec..84b0cde1982558 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -12,8 +12,8 @@
def global_ptr_ty : LLVMQualPointerType<1>;
-class AMDGPUReadPreloadRegisterIntrinsic
- : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
+class AMDGPUReadPreloadRegisterIntrinsic<LLVMType type>
+ : DefaultAttrsIntrinsic<[type], [], [IntrNoMem, IntrSpeculatable]>;
class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;
@@ -27,10 +27,10 @@ class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
let TargetPrefix = "r600" in {
-multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
- def _x : AMDGPUReadPreloadRegisterIntrinsic;
- def _y : AMDGPUReadPreloadRegisterIntrinsic;
- def _z : AMDGPUReadPreloadRegisterIntrinsic;
+multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz<LLVMType type> {
+ def _x : AMDGPUReadPreloadRegisterIntrinsic<type>;
+ def _y : AMDGPUReadPreloadRegisterIntrinsic<type>;
+ def _z : AMDGPUReadPreloadRegisterIntrinsic<type>;
}
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
@@ -46,8 +46,8 @@ defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
<"__builtin_r600_read_tgid">;
-defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
-defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
+defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;
@@ -138,10 +138,14 @@ let TargetPrefix = "amdgcn" in {
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//
-defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
<"__builtin_amdgcn_workgroup_id">;
+// Intended to be used when preloading implicit kernel arguments.
+defm int_amdgcn_workgroup_size :
+ AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i16_ty>;
+
def int_amdgcn_dispatch_ptr :
DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
[Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8c4..e6aed12673c941 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -95,6 +95,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
struct KernArgPreloadDescriptor : public ArgDescriptor {
KernArgPreloadDescriptor() {}
SmallVector<MCRegister> Regs;
+ unsigned ByteOffset;
};
struct AMDGPUFunctionArgInfo {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index bc58407a73294c..03544279b49efe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -13,8 +13,10 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
@@ -31,9 +33,13 @@ class PreloadKernelArgInfo {
const GCNSubtarget &ST;
unsigned NumFreeUserSGPRs;
-public:
- SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+ enum ImplicitArgOffsets {
+ HIDDEN_GROUP_SIZE_X_OFFSET = 12,
+ HIDDEN_GROUP_SIZE_Y_OFFSET = 14,
+ HIDDEN_GROUP_SIZE_Z_OFFSET = 16,
+ };
+public:
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
setInitialFreeUserSGPRsCount();
}
@@ -64,6 +70,111 @@ class PreloadKernelArgInfo {
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
return true;
}
+
+ // Try to allocate SGPRs to preload implicit kernel arguments.
+ void tryAllocImplicitArgPreloadSGPRs(unsigned ImplicitArgsBaseOffset,
+ IRBuilder<> &Builder) {
+ unsigned LastExplicitArgOffset = ImplicitArgsBaseOffset;
+ IntrinsicInst *ImplicitArgPtr = nullptr;
+ for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+ for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(I))
+ if (CI->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr) {
+ ImplicitArgPtr = CI;
+ break;
+ }
+ }
+ }
+ if (!ImplicitArgPtr)
+ return;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+ for (auto *U : ImplicitArgPtr->users()) {
+ if (!U->hasOneUse())
+ continue;
+
+ // FIXME: The loop below is mostly copied from
+ // AMDGPULowerKernelAttributes.cpp, should combine the logic somewhere.
+ int64_t Offset = 0;
+ auto *Load =
+ dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+ auto *BCI = dyn_cast<BitCastInst>(U);
+ if (!Load && !BCI) {
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != ImplicitArgPtr)
+ continue;
+ Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+ BCI = dyn_cast<BitCastInst>(*U->user_begin());
+ }
+
+ if (BCI) {
+ if (!BCI->hasOneUse())
+ continue;
+ Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+ }
+
+ if (!Load || !Load->isSimple())
+ continue;
+
+ unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+ switch (Offset) {
+ case HIDDEN_GROUP_SIZE_X_OFFSET:
+ if (LoadSize == 2)
+ GroupSizes[0] = Load;
+ break;
+ case HIDDEN_GROUP_SIZE_Y_OFFSET:
+ if (LoadSize == 2)
+ GroupSizes[1] = Load;
+ break;
+ case HIDDEN_GROUP_SIZE_Z_OFFSET:
+ if (LoadSize == 2)
+ GroupSizes[2] = Load;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // If we fail to preload any implicit argument we know we don't have SGPRs
+ // to preload any subsequent ones with larger offsets.
+ if (GroupSizes[0]) {
+ if (!tryAllocPreloadSGPRs(
+ 2, ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_X_OFFSET,
+ LastExplicitArgOffset))
+ return;
+ LastExplicitArgOffset =
+ ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_X_OFFSET + 2;
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_workgroup_size_x, {}, {});
+ GroupSizes[0]->replaceAllUsesWith(CI);
+ F.addFnAttr("amdgpu-preload-work-group-size-x");
+ }
+
+ if (GroupSizes[1]) {
+ if (!tryAllocPreloadSGPRs(
+ 2, ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Y_OFFSET,
+ LastExplicitArgOffset))
+ return;
+ LastExplicitArgOffset =
+ ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Y_OFFSET + 2;
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_workgroup_size_y, {}, {});
+ GroupSizes[1]->replaceAllUsesWith(CI);
+ F.addFnAttr("amdgpu-preload-work-group-size-y");
+ }
+
+ if (GroupSizes[2]) {
+ if (!tryAllocPreloadSGPRs(
+ 2, ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Z_OFFSET,
+ LastExplicitArgOffset))
+ return;
+ LastExplicitArgOffset =
+ ImplicitArgsBaseOffset + HIDDEN_GROUP_SIZE_Z_OFFSET + 2;
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_workgroup_size_z, {}, {});
+ GroupSizes[2]->replaceAllUsesWith(CI);
+ F.addFnAttr("amdgpu-preload-work-group-size-z");
+ }
+ }
};
class AMDGPULowerKernelArguments : public FunctionPass {
@@ -282,6 +393,15 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
KernArgSegment->addRetAttr(
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+ if (InPreloadSequence) {
+ // Alignment for first implicit arg is 4 from hidden_block_count_x.
+ const unsigned FirstImplicitArgAlignment = 4;
+ uint64_t ImplicitArgsBaseOffset =
+ alignTo(ExplicitArgOffset, Align(FirstImplicitArgAlignment)) +
+ BaseOffset;
+ PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, Builder);
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..2765df6bc7fdae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2444,8 +2444,8 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
-// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
-// sequential starting from the first argument.
+// Allocate pre-loaded kernel arguments. Preloaded arguments must be
+// sequential and preloading must also start from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
@@ -2456,6 +2456,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
bool InPreloadSequence = true;
unsigned InIdx = 0;
+ const Align KernelArgBaseAlign = Align(16);
for (auto &Arg : F.args()) {
if (!InPreloadSequence || !Arg.hasInRegAttr())
break;
@@ -2472,7 +2473,6 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
InIdx++) {
assert(ArgLocs[ArgIdx].isMemLoc());
auto &ArgLoc = ArgLocs[InIdx];
- const Align KernelArgBaseAlign = Align(16);
unsigned ArgOffset = ArgLoc.getLocMemOffset();
Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
unsigned NumAllocSGPRs =
@@ -2511,6 +2511,88 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
}
}
+
+ if (Info.hasWorkGroupSizeX() || Info.hasWorkGroupSizeY() ||
+ Info.hasWorkGroupSizeZ()) {
+ unsigned ImplicitArgsBaseOffset = 0;
+ unsigned ImplictArgsBaseIdx = MF.getFunction().arg_size();
+ for (auto &Arg : MF.getFunction().args()) {
+ Type *Ty;
+ MaybeAlign Align;
+ if (Arg.hasByRefAttr()) {
+ Ty = Arg.getParamByRefType();
+ Align = Arg.getParamAlign();
+ } else {
+ Ty = Arg.getType();
+ Align = MF.getDataLayout().getABITypeAlign(Ty);
+ }
+ auto Size = MF.getDataLayout().getTypeAllocSize(Ty);
+ ImplicitArgsBaseOffset = alignTo(ImplicitArgsBaseOffset, *Align);
+ ImplicitArgsBaseOffset += Size;
+ }
+ unsigned ImplicitArgBaseSGPROffset = alignTo(ImplicitArgsBaseOffset, 4) / 4;
+ assert(ImplicitArgBaseSGPROffset <
+ AMDGPU::getMaxNumUserSGPRs(MF.getSubtarget()));
+ Info.allocateUserSGPRs(ImplicitArgBaseSGPROffset);
+
+ unsigned AllocatedSGPRs = ImplicitArgBaseSGPROffset;
+ // FIXME: Create helper/refactor.
+ if (Info.hasWorkGroupSizeX()) {
+ unsigned Offset = ImplicitArgsBaseOffset + 12;
+ unsigned ImplictArgIdx = ImplictArgsBaseIdx + 3;
+ Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
+ unsigned Padding = alignTo(Offset, 4) / 4 - AllocatedSGPRs;
+ if (Alignment < 4)
+ Padding -= 1;
+ // Byte offset for data in preload SGPRs.
+ unsigned ByteOffset = Alignment < 4 ? 2 : 0;
+ SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+ TRI, &AMDGPU::SReg_32RegClass, 1, ImplictArgIdx, Padding, ByteOffset);
+
+ MCRegister Reg = PreloadRegs->front();
+ assert(Reg);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupSizeY()) {
+ unsigned Offset = ImplicitArgsBaseOffset + 14;
+ unsigned ImplictArgIdx = ImplictArgsBaseIdx + 4;
+ Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
+ unsigned Padding = alignTo(Offset, 4) / 4 - AllocatedSGPRs;
+ if (Alignment < 4)
+ Padding -= 1;
+
+ // Byte offset for data in preload SGPRs.
+ unsigned ByteOffset = Alignment < 4 ? 2 : 0;
+ SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+ TRI, &AMDGPU::SReg_32RegClass, 1, ImplictArgIdx, Padding, ByteOffset);
+
+ MCRegister Reg = PreloadRegs->front();
+ assert(Reg);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupSizeZ()) {
+ unsigned Offset = ImplicitArgsBaseOffset + 16;
+ unsigned ImplictArgIdx = ImplictArgsBaseIdx + 5;
+ Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
+ unsigned Padding = alignTo(Offset, 4) / 4 - AllocatedSGPRs;
+ if (Alignment < 4)
+ Padding -= 1;
+
+ // Byte offset for data in preload SGPRs.
+ unsigned ByteOffset = Alignment < 4 ? 2 : 0;
+ SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+ TRI, &AMDGPU::SReg_32RegClass, 1, ImplictArgIdx, Padding, ByteOffset);
+
+ MCRegister Reg = PreloadRegs->front();
+ assert(Reg);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+ }
}
void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
@@ -8325,6 +8407,45 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
IndexKeyi32, Op.getOperand(7)});
}
+ case Intrinsic::amdgcn_workgroup_size_x:
+ case Intrinsic::amdgcn_workgroup_size_y:
+ case Intrinsic::amdgcn_workgroup_size_z: {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ assert(ST.hasKernargPreload());
+ SDLoc DL(Op);
+ unsigned ImplictArgsBaseIdx = MF.getFunction().arg_size();
+ unsigned ImplictArgIdx = ImplictArgsBaseIdx;
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_workgroup_size_x:
+ ImplictArgIdx = ImplictArgsBaseIdx + 3;
+ break;
+ case Intrinsic::amdgcn_workgroup_size_y:
+ ImplictArgIdx = ImplictArgsBaseIdx + 4;
+ break;
+ case Intrinsic::amdgcn_workgroup_size_z:
+ ImplictArgIdx = ImplictArgsBaseIdx + 5;
+ break;
+ }
+
+ auto &ArgInfo = MFI->getArgInfo()
+ .PreloadKernArgs.find(ImplictArgIdx)
+ ->getSecond();
+ Register Reg = ArgInfo.Regs[0];
+ unsigned ByteOffset = ArgInfo.ByteOffset;
+ Register VReg = MRI.getLiveInVirtReg(Reg);
+ SDValue Preload =
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i32);
+ if (ByteOffset == 0) {
+ Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Preload);
+ } else {
+ SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Preload, ShiftAmt);
+
+ Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Extract);
+ }
+ return Preload;
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 52d6fe6c7ba51c..0a85af0f5ac1c3 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -42,6 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
+ WorkGroupSizeX(false), WorkGroupSizeY(false), WorkGroupSizeZ(false),
GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -58,6 +59,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (IsKernel) {
WorkGroupIDX = true;
WorkItemIDX = true;
+ if (F.hasFnAttribute("amdgpu-preload-work-group-size-x"))
+ WorkGroupSizeX = true;
+
+ if (F.hasFnAttribute("amdgpu-preload-work-group-size-y"))
+ WorkGroupSizeY = true;
+
+ if (F.hasFnAttribute("amdgpu-preload-work-group-size-z"))
+ WorkGroupSizeZ = true;
+
} else if (CC == CallingConv::AMDGPU_PS) {
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
@@ -245,7 +255,8 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
- unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+ unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+ unsigned ByteOffset) {
assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
"Preload kernel argument allocated twice.");
NumUserSGPRs += PaddingSGPRs;
@@ -254,6 +265,7 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
// merge them.
Register PreloadReg =
TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+ ArgInfo.PreloadKernArgs[KernArgIdx].ByteOffset = ByteOffset;
if (PreloadReg &&
(RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
@@ -270,6 +282,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
}
+bool SIMachineFunctionInfo::allocateUserSGPRs(
+ unsigned Number) {
+ if (Number <= getNumUserSGPRs())
+ return false;
+
+ NumUserSGPRs = Number;
+ return true;
+}
+
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
uint64_t Size, Align Alignment) {
// Skip if it is an entry function or the register is already added.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea74..64dc7e78a94186 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -458,6 +458,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// user arguments. This is an offset from the KernargSegmentPtr.
bool ImplicitArgPtr : 1;
+ bool WorkGroupSizeX : 1;
+ bool WorkGroupSizeY : 1;
+ bool WorkGroupSizeZ : 1;
+
bool MayNeedAGPRs : 1;
// The hard-wired high half of the address of the global information table
@@ -740,8 +744,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
Register addLDSKernelId();
SmallVectorImpl<MCRegister> *
addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
- unsigned AllocSizeDWord, int KernArgIdx,
- int PaddingSGPRs);
+ unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+ unsigned Offset = 0);
+
+  /// Grow the number of reserved user SGPRs to \p Number if fewer are
+  /// currently reserved.
+ bool allocateUserSGPRs(unsigned Number);
/// Increment user SGPRs used for padding the argument list only.
Register addReservedUserSGPR() {
@@ -837,6 +844,18 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
return ImplicitArgPtr;
}
+ bool hasWorkGroupSizeX() const {
+ return WorkGroupSizeX;
+ }
+
+ bool hasWorkGroupSizeY() const {
+ return WorkGroupSizeY;
+ }
+
+ bool hasWorkGroupSizeZ() const {
+ return WorkGroupSizeZ;
+ }
+
AMDGPUFunctionArgI...
[truncated]
``````````
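As a worked example of the kernarg dword packing the SIISelLowering changes have to handle: assume a hypothetical kernel with a single preloaded 8-byte pointer argument, so the hidden arguments begin at kernarg byte offset 8. hidden_group_size_x then sits at byte 20 (dword-aligned, ByteOffset 0), hidden_group_size_y at byte 22 (upper half of the same dword, ByteOffset 2), and hidden_group_size_z at byte 24 (ByteOffset 0 again). The kernel below is a sketch under those assumptions, written in the post-transform form the IR pass produces:

```llvm
; Hypothetical kernel: one 8-byte explicit arg, so hidden args start at byte 8.
;   hidden_group_size_x -> byte 20, low half of a kernarg dword   (ByteOffset 0)
;   hidden_group_size_y -> byte 22, high half of the same dword   (ByteOffset 2)
;   hidden_group_size_z -> byte 24, low half of the next dword    (ByteOffset 0)
define amdgpu_kernel void @sizes(ptr addrspace(1) inreg %out) #0 {
  %x = call i16 @llvm.amdgcn.workgroup.size.x()
  %y = call i16 @llvm.amdgcn.workgroup.size.y()
  %z = call i16 @llvm.amdgcn.workgroup.size.z()
  %xy = add i16 %x, %y
  %xyz = add i16 %xy, %z
  store i16 %xyz, ptr addrspace(1) %out
  ret void
}

declare i16 @llvm.amdgcn.workgroup.size.x()
declare i16 @llvm.amdgcn.workgroup.size.y()
declare i16 @llvm.amdgcn.workgroup.size.z()

attributes #0 = { "amdgpu-preload-work-group-size-x"
                  "amdgpu-preload-work-group-size-y"
                  "amdgpu-preload-work-group-size-z" }
```

Because two 16-bit hidden fields can share a kernarg dword, the ByteOffset recorded in KernArgPreloadDescriptor is what lets LowerINTRINSIC_WO_CHAIN choose between a plain truncate (low half) and a shift-right-by-16 plus truncate (high half) of the preloaded SGPR.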
https://github.com/llvm/llvm-project/pull/83817