[llvm] [AMDGPU] Split struct kernel arguments (PR #133786)

Sat Jun 14 05:51:27 PDT 2025

================
@@ -0,0 +1,328 @@
+//===--- AMDGPUSplitKernelArguments.cpp - Split kernel arguments ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This pass flattens struct-type kernel arguments. It eliminates
+// unused fields and only keeps used fields. The objective is to facilitate
+// preloading of kernel arguments by later passes.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#define DEBUG_TYPE "amdgpu-split-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+static cl::opt<bool> EnableSplitKernelArgs( // Boolean command-line option.
+    "amdgpu-enable-split-kernel-args",
+    cl::desc("Enable splitting of AMDGPU kernel arguments"), cl::init(false)); // Initialized to false.
+
+class AMDGPUSplitKernelArguments : public ModulePass { // Legacy-PM module pass.
+public:
+  static char ID; // Handed to the ModulePass base constructor below.
+
+  AMDGPUSplitKernelArguments() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override; // Defined outside this class body.
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG(); // Tells the pass manager this pass preserves the CFG.
+  }
+
+private:
+  bool processFunction(Function &F); // Defined after the anonymous namespace.
+};
+
+} // end anonymous namespace
+
+bool AMDGPUSplitKernelArguments::processFunction(Function &F) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  SmallVector<std::tuple<unsigned, unsigned, uint64_t>, 8> NewArgMappings;
+  DenseMap<Argument *, SmallVector<LoadInst *, 8>> ArgToLoadsMap;
+  DenseMap<Argument *, SmallVector<GetElementPtrInst *, 8>> ArgToGEPsMap;
+  SmallVector<Argument *, 8> StructArgs;
+  SmallVector<Type *, 8> NewArgTypes;
+
+  // Collect struct arguments and new argument types
+  unsigned OriginalArgIndex = 0;
+  unsigned NewArgIndex = 0;
+  for (Argument &Arg : F.args()) {
+    if (Arg.use_empty()) {
+      NewArgTypes.push_back(Arg.getType());
+      NewArgMappings.push_back(
+          std::make_tuple(NewArgIndex, OriginalArgIndex, 0));
+      ++NewArgIndex;
+      ++OriginalArgIndex;
+      continue;
+    }
+
+    PointerType *PT = dyn_cast<PointerType>(Arg.getType());
+    if (!PT) {
+      NewArgTypes.push_back(Arg.getType());
+      // Include mapping if indices have changed
+      if (NewArgIndex != OriginalArgIndex)
+        NewArgMappings.push_back(
+            std::make_tuple(NewArgIndex, OriginalArgIndex, 0));
+      ++NewArgIndex;
+      ++OriginalArgIndex;
+      continue;
+    }
+
+    if (!Arg.hasByRefAttr()) {
+      NewArgTypes.push_back(Arg.getType());
+      // Include mapping if indices have changed
+      if (NewArgIndex != OriginalArgIndex)
+        NewArgMappings.push_back(
+            std::make_tuple(NewArgIndex, OriginalArgIndex, 0));
+      ++NewArgIndex;
+      ++OriginalArgIndex;
+      continue;
+    }
+
+    if (!isa<StructType>(Arg.getParamByRefType())) {
+      NewArgTypes.push_back(Arg.getType());
+      // Include mapping if indices have changed
+      if (NewArgIndex != OriginalArgIndex)
+        NewArgMappings.push_back(
+            std::make_tuple(NewArgIndex, OriginalArgIndex, 0));
+      ++NewArgIndex;
+      ++OriginalArgIndex;
+      continue;
+    }
+
+    bool AllLoadsOrGEPs = true;
+    SmallVector<LoadInst *, 8> Loads;
+    SmallVector<GetElementPtrInst *, 8> GEPs;
+    for (User *U : Arg.users()) {
+      LLVM_DEBUG(dbgs() << "  User: " << *U << "\n");
+      if (auto *LI = dyn_cast<LoadInst>(U)) {
+        Loads.push_back(LI);
+      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+        GEPs.push_back(GEP);
+        for (User *GEPUser : GEP->users()) {
+          LLVM_DEBUG(dbgs() << "    GEP User: " << *GEPUser << "\n");
+          if (auto *GEPLoad = dyn_cast<LoadInst>(GEPUser)) {
+            Loads.push_back(GEPLoad);
+          } else {
----------------
yxsamliu wrote:

By "nested GEP" I assume it refers to the case `arg -> GEP -> GEP -> load`.

The pass currently does not split kernel args accessed through nested GEPs. It just keeps them as they are. However, since in most cases struct-type kernel arguments are accessed through a single GEP, even for nested struct types, this does not lose many performance opportunities.

This is because SROA can combine the nested GEPs generated by clang if the indices are constant. Therefore, even for nested struct-type arguments, accessing each data member results in a single GEP, e.g. https://godbolt.org/z/xKGreYM9e.

Nested GEPs may occur if one of the indices is not constant, e.g. when accessing elements of an array-type member by a non-constant index. However, that usually only happens with large array-type members, since small loops are usually unrolled. Such kernel args may not be preloaded due to their large sizes; therefore, splitting them may not improve performance.

Therefore, for now we only consider splitting the cases of `arg -> load` and `arg -> GEP -> load`.

https://github.com/llvm/llvm-project/pull/133786