[llvm] [AMDGPU] Add support for preloading implicit kernel arguments (PR #83817)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Sun May 5 22:55:13 PDT 2024

@@ -64,6 +64,86 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(unsigned ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    IntrinsicInst *ImplicitArgPtr = nullptr;
+    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+        if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(I))
+          if (CI->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr) {
+            ImplicitArgPtr = CI;
+            break;
+          }
+      }
+    }
+    if (!ImplicitArgPtr)
+      return;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      if (!U->hasOneUse())
+        continue;
+      int64_t Offset = 0;
+      auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+      if (!Load) {
+        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != ImplicitArgPtr)
+          continue;
+        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      }
+      if (!Load || !Load->isSimple())
+        continue;
+      // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+      if (Load->getType() != Builder.getInt16Ty() &&
+          Load->getType() != Builder.getInt32Ty())
+        continue;
+      ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+    }
+    if (ImplicitArgLoads.empty())
+      return;
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+    unsigned LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    bool HasPreloadImplicitArgs = false;
+    for (const auto &Load : ImplicitArgLoads) {
+      LoadInst *LoadInst = Load.first;
+      Type *LoadType = LoadInst->getType();
+      auto LoadOffset = Load.second;
+      unsigned LoadSize = DL.getTypeStoreSize(LoadType);
+      // If we fail to preload any implicit argument we know we don't have SGPRs
+      // to preload any subsequent ones with larger offsets.
+      if (!tryAllocPreloadSGPRs(LoadSize, LoadOffset + ImplicitArgsBaseOffset,
+                                LastExplicitArgOffset))
+        break;
+      HasPreloadImplicitArgs = true;
+      LastExplicitArgOffset = LoadOffset + LoadSize;
+      llvm::Value *LoadOffsetValue =
+          llvm::ConstantInt::get(Builder.getInt32Ty(), LoadOffset);
+      CallInst *PreloadIntrin = Builder.CreateIntrinsic(
+          Intrinsic::amdgcn_preload_implicitarg, {LoadType}, {LoadOffsetValue});
+      LoadInst->replaceAllUsesWith(PreloadIntrin);
+    }
+    if (HasPreloadImplicitArgs)
+      F.addFnAttr("amdgpu-preload-implicitargs");
kerbowa wrote:

This was added to avoid searching for these intrinsics every time user SGPRs are allocated, which happens right before isel. It could be avoided if we either add the intrinsics later (when allocating user SGPRs) or if I just get rid of the intrinsic entirely. I commented on the latter option below.


More information about the llvm-commits mailing list