[llvm] [AMDGPU] Move kernarg preload logic to separate pass (PR #130434)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 8 18:42:49 PST 2025
================
@@ -0,0 +1,358 @@
+//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
+/// execution begins. The number of registers available for preloading depends
+/// on the number of free user SGPRs, up to the hardware's maximum limit.
+/// Implicit arguments enabled in the kernel descriptor are allocated first,
+/// followed by SGPRs used for preloaded kernel arguments. (Reference:
+/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
+/// Additionally, hidden kernel arguments may be preloaded, in which case they
+/// are appended to the kernel signature after explicit arguments. Preloaded
+/// arguments will be marked with `inreg`.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"
+
+using namespace llvm;
+
+// Command-line override for how many explicit kernel arguments to
+// force-preload (arguments already marked 'inreg' are preloaded regardless).
+static cl::opt<unsigned> KernargPreloadCount(
+    "amdgpu-kernarg-preload-count",
+    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
+
+namespace {
+
+// Legacy pass-manager wrapper around the preload logic. This is a module
+// pass because preloading hidden arguments requires cloning a kernel with an
+// extended signature and erasing the original from the module.
+class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
+  // Used to look up the per-function GCNSubtarget. May be null, in which
+  // case runOnModule is a no-op.
+  const AMDGPUTargetMachine *TM;
+
+public:
+  static char ID;
+  explicit AMDGPUPreloadKernelArgumentsLegacy(
+      const AMDGPUTargetMachine *TM = nullptr);
+
+  StringRef getPassName() const override {
+    return "AMDGPU Preload Kernel Arguments";
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+// Per-kernel helper that tracks how many user SGPRs are free for preloading
+// and, when profitable, clones the kernel with hidden (implicit) arguments
+// appended to its signature so they can be preloaded as well.
+class PreloadKernelArgInfo {
+private:
+  Function &F;
+  const GCNSubtarget &ST;
+  // User SGPRs available for preloading; set once at construction via
+  // setInitialFreeUserSGPRsCount().
+  unsigned NumFreeUserSGPRs;
+
+  // Hidden (implicit) arguments eligible for preloading, in kernarg-segment
+  // layout order. END_HIDDEN_ARGS also serves as a "not found" sentinel.
+  enum HiddenArg : unsigned {
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS
+  };
+
+  // Stores information about a specific hidden argument.
+  struct HiddenArgInfo {
+    // Offset in bytes from the location in the kernarg segment pointed to by
+    // the implicitarg pointer.
+    uint8_t Offset;
+    // The size of the hidden argument in bytes.
+    uint8_t Size;
+    // The name of the hidden argument in the kernel signature.
+    const char *Name;
+  };
+
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  // Maps a byte offset within the implicit-arg area to the hidden argument
+  // that starts at that offset, or END_HIDDEN_ARGS if none does.
+  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    return END_HIDDEN_ARGS;
+  }
+
+  // Returns the integer IR type whose width matches the hidden argument's
+  // byte size.
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  // Returns the parameter name given to the hidden argument in the cloned
+  // kernel's signature.
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  // Clones the function after adding implicit arguments to the argument list
+  // and returns the new updated function. Preloaded implicit arguments are
+  // added up to and including the last one that will be preloaded, indicated by
+  // LastPreloadIndex. Currently preloading is only performed on the totality of
+  // sequential data from the kernarg segment including implicit (hidden)
+  // arguments. This means that all arguments up to the last preloaded argument
+  // will also be preloaded even if that data is unused.
+  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+    FunctionType *FT = F.getFunctionType();
+    LLVMContext &Ctx = F.getParent()->getContext();
+    // New parameter list: all existing parameters followed by hidden args
+    // [0, LastPreloadIndex].
+    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
+    // Splice the old body into the clone and rewire the original arguments
+    // to the corresponding new ones.
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+
+    // Mark the appended hidden arguments 'inreg' (preloaded) and tag them
+    // with "amdgpu-hidden-argument" so a second run of the pass recognizes
+    // and skips them.
+    AttrBuilder AB(Ctx);
+    AB.addAttribute(Attribute::InReg);
+    AB.addAttribute("amdgpu-hidden-argument");
+    AttributeList AL = NF->getAttributes();
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+    }
+
+    NF->setAttributes(AL);
+    F.replaceAllUsesWith(NF);
+
+    return NF;
+  }
+
+public:
+  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+    setInitialFreeUserSGPRsCount();
+  }
+
+  // Sets NumFreeUserSGPRs to the maximum number of user SGPRs available to
+  // preload arguments for this function.
+  void setInitialFreeUserSGPRsCount() {
+    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
+  }
+
+  // True if all kernarg data up to ExplicitArgOffset fits in the free user
+  // SGPRs (each SGPR holds 4 bytes of kernarg data).
+  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
+    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
+  }
+
+  // Try to allocate SGPRs to preload hidden kernel arguments.
+  void
+  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                SmallVectorImpl<Function *> &FunctionsToErase) {
+    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
+        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+
+          // NOTE(review): only the first user of the GEP is inspected, and
+          // *U->user_begin() assumes at least one user exists (UB on a
+          // user-less value) — verify both assumptions hold here.
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle merged loads.
+        LLVMContext &Ctx = F.getParent()->getContext();
+        Type *LoadTy = Load->getType();
+        HiddenArg HA = getHiddenArgFromOffset(Offset);
+        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
+
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets. Find the first
+    // argument that we cannot preload.
+    auto *PreloadEnd =
+        std::find_if(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+                     [&](const std::pair<LoadInst *, unsigned> &Load) {
+                       unsigned LoadSize =
+                           DL.getTypeStoreSize(Load.first->getType());
+                       unsigned LoadOffset = Load.second;
+                       if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
+                                                      ImplicitArgsBaseOffset))
+                         return true;
+
+                       return false;
+                     });
+
+    if (PreloadEnd == ImplicitArgLoads.begin())
+      return;
+
+    // Clone the kernel with hidden args appended up to the last preloadable
+    // one, then rewrite each qualifying load to use the new argument. The
+    // original function is queued for erasure by the caller.
+    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
+    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+    assert(NF);
+    FunctionsToErase.push_back(&F);
+    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+      LoadInst *LoadInst = I->first;
+      unsigned LoadOffset = I->second;
+      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
+      // Hidden args occupy the tail of the new argument list; index back
+      // from the end to find the one matching this load's offset.
+      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+      Argument *Arg = NF->getArg(Index);
+      LoadInst->replaceAllUsesWith(Arg);
+    }
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;
+
+// Register the legacy pass so it can be referenced by name in pass pipelines.
+INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
+                "AMDGPU Preload Kernel Arguments", false, false)
+
+// Factory for the legacy pass. NOTE(review): TM is downcast unchecked;
+// callers must pass an AMDGPUTargetMachine (or null).
+ModulePass *
+llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
+  return new AMDGPUPreloadKernelArgumentsLegacy(
+      static_cast<const AMDGPUTargetMachine *>(TM));
+}
+
+// TM may be null (e.g. default-constructed via the pass registry);
+// runOnModule bails out in that case.
+AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
+    const AMDGPUTargetMachine *TM)
+    : ModulePass(ID), TM(TM) {}
+
+// Marks preloadable explicit kernel arguments 'inreg' and, when every
+// explicit argument can be preloaded, also tries to preload hidden
+// arguments. Returns true if any argument attributes were added.
+static bool markKernelArgsAsInreg(Module &M, const AMDGPUTargetMachine &TM) {
+  SmallVector<Function *, 4> FunctionsToErase;
+  bool Changed = false;
+  for (auto &F : M) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    if (!ST.hasKernargPreload() ||
+        F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+
+    PreloadKernelArgInfo PreloadInfo(F, ST);
+    uint64_t ExplicitArgOffset = 0;
+    const DataLayout &DL = F.getDataLayout();
+    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+    unsigned NumPreloadsRequested = KernargPreloadCount;
+    unsigned NumPreloadedExplicitArgs = 0;
+    for (Argument &Arg : F.args()) {
+      // Avoid incompatible attributes and guard against running this pass
+      // twice.
+      //
+      // TODO: Preload byref kernel arguments
+      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+          Arg.hasAttribute("amdgpu-hidden-argument"))
+        break;
+
+      // Inreg may be pre-existing on some arguments, try to preload these.
+      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+        break;
+
+      // FIXME: Preload aggregates.
+      if (Arg.getType()->isAggregateType())
+        break;
+
+      // Advance the running kernarg offset past this argument, then check
+      // that everything up to and including it still fits in user SGPRs.
+      Type *ArgTy = Arg.getType();
+      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+
+      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
+        break;
+
+      Arg.addAttr(Attribute::InReg);
+      NumPreloadedExplicitArgs++;
+      if (NumPreloadsRequested > 0)
+        NumPreloadsRequested--;
+    }
+
+    // Only try preloading hidden arguments if we can successfully preload the
+    // last explicit argument.
+    if (NumPreloadedExplicitArgs == F.arg_size()) {
+      uint64_t ImplicitArgsBaseOffset =
+          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+          BaseOffset;
+      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                FunctionsToErase);
+    }
+
+    // NOTE(review): a kernel with zero explicit arguments can still get
+    // hidden arguments preloaded (cloning the function above), yet Changed
+    // stays false since NumPreloadedExplicitArgs == 0 — confirm intended.
+    Changed |= NumPreloadedExplicitArgs > 0;
+  }
+
+  // Erase cloned functions if we needed to update the kernel signature to
+  // support preloading hidden kernel arguments.
+  for (auto *F : FunctionsToErase)
+    F->eraseFromParent();
+
+  return Changed;
+}
+
+// Legacy PM entry point; a no-op when the pass is skipped (e.g. opt-bisect)
+// or no target machine was provided.
+bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
+  if (skipModule(M) || !TM)
+    return false;
+
+  return markKernelArgsAsInreg(M, *TM);
+}
+
+// New pass-manager entry point; conservatively invalidates all analyses when
+// any kernel was modified.
+PreservedAnalyses
+AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
+  bool Changed = markKernelArgsAsInreg(M, TM);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
----------------
shiltian wrote:
empty line
https://github.com/llvm/llvm-project/pull/130434
More information about the llvm-commits
mailing list