[llvm] [AMDGPU] Move kernarg preload logic to AMDGPU Attributor (PR #123547)

Fri Jan 31 09:27:17 PST 2025

================
@@ -1314,19 +1523,64 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
-static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  for (unsigned I = 0;
-       I < F.arg_size() &&
-       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
-       ++I) {
-    Argument &Arg = *F.getArg(I);
-    // Check for incompatible attributes.
-    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
-      break;
+static void markKernelArgsAsInreg(SetVector<Function *> &Functions,
+                                  TargetMachine &TM) {
+  SmallVector<Function *, 4> FunctionsToErase;
+  for (auto *F : Functions) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    if (!ST.hasKernargPreload() ||
+        F->getCallingConv() != CallingConv::AMDGPU_KERNEL || F->arg_empty())
+      continue;
+
+    PreloadKernelArgInfo PreloadInfo(*F, ST);
+    uint64_t ExplicitArgOffset = 0;
+    const DataLayout &DL = F->getDataLayout();
+    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
+    unsigned NumPreloadsRequested = KernargPreloadCount;
+    unsigned NumPreloadedExplicitArgs = 0;
+    for (Argument &Arg : F->args()) {
+      // Avoid incompatible attributes and guard against running this pass
+      // twice.
+      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
+          Arg.hasAttribute("amdgpu-hidden-argument"))
+        break;
+
+      // Inreg may be pre-existing on some arguments, try to preload these.
+      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
+        break;
+
+      // FIXME: Preload aggregates.
+      if (Arg.getType()->isAggregateType())
+        break;
+
+      Type *ArgTy = Arg.getType();
+      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
+      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
----------------
kerbowa wrote:

I think the way I'm doing it is what we want, since AMDGPUCallLowering::lowerFormalArgumentsKernel and the HSA metadata streamer calculate the argument offsets the same way, so this stays consistent with how the arguments are processed there.

https://github.com/llvm/llvm-project/pull/123547