[llvm] [AMDGPU] Add support for preloading implicit kernel arguments (PR #83817)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 26 05:37:48 PDT 2024
================
@@ -64,6 +64,86 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(unsigned ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    IntrinsicInst *ImplicitArgPtr = nullptr;
+    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+        if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(I))
+          if (CI->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr) {
+            ImplicitArgPtr = CI;
+            break;
+          }
+      }
+    }
+
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      if (!U->hasOneUse())
+        continue;
+
+      int64_t Offset = 0;
+      auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+      if (!Load) {
+        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != ImplicitArgPtr)
+          continue;
+        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      }
+
+      if (!Load || !Load->isSimple())
+        continue;
+
+      // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+      if (Load->getType() != Builder.getInt16Ty() &&
+          Load->getType() != Builder.getInt32Ty())
+        continue;
+
+      ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+
+    unsigned LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    bool HasPreloadImplicitArgs = false;
+    for (const auto &Load : ImplicitArgLoads) {
+      LoadInst *LoadInst = Load.first;
+      Type *LoadType = LoadInst->getType();
+      auto LoadOffset = Load.second;
+      unsigned LoadSize = DL.getTypeStoreSize(LoadType);
+      // If we fail to preload any implicit argument we know we don't have SGPRs
+      // to preload any subsequent ones with larger offsets.
+      if (!tryAllocPreloadSGPRs(LoadSize, LoadOffset + ImplicitArgsBaseOffset,
+                                LastExplicitArgOffset))
+        break;
+
+      HasPreloadImplicitArgs = true;
+      LastExplicitArgOffset = LoadOffset + LoadSize;
+      llvm::Value *LoadOffsetValue =
+          llvm::ConstantInt::get(Builder.getInt32Ty(), LoadOffset);
+      CallInst *PreloadIntrin = Builder.CreateIntrinsic(
+          Intrinsic::amdgcn_preload_implicitarg, {LoadType}, {LoadOffsetValue});
+      LoadInst->replaceAllUsesWith(PreloadIntrin);
+    }
+
+    if (HasPreloadImplicitArgs)
+      F.addFnAttr("amdgpu-preload-implicitargs");
----------------
arsenm wrote:
Can you avoid adding this attribute?
https://github.com/llvm/llvm-project/pull/83817
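For reference, a rough sketch (in LLVM IR) of the rewrite the new
tryAllocImplicitArgPreloadSGPRs hunk performs. The kernel, the byte offset of
12, and the exact name mangling of the preload intrinsic are illustrative
assumptions based on the code above, not taken from the patch's tests:

  ; Before: a simple i32 load at a constant offset from the implicit kernarg
  ; pointer (hypothetical kernel).
  define amdgpu_kernel void @kern(ptr addrspace(1) %out) {
    %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    %gep = getelementptr i8, ptr addrspace(4) %implicitarg.ptr, i64 12
    %val = load i32, ptr addrspace(4) %gep, align 4
    store i32 %val, ptr addrspace(1) %out
    ret void
  }

  ; After: uses of the load are replaced by a call to the new preload
  ; intrinsic carrying the byte offset, and the function gains the
  ; "amdgpu-preload-implicitargs" attribute the review comment asks about.
  ; The now-dead load and GEP are omitted here for brevity (the pass only
  ; rewrites uses, it does not erase the old instructions).
  define amdgpu_kernel void @kern(ptr addrspace(1) %out) #0 {
    %val = call i32 @llvm.amdgcn.preload.implicitarg.i32(i32 12)
    store i32 %val, ptr addrspace(1) %out
    ret void
  }

  attributes #0 = { "amdgpu-preload-implicitargs" }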