[llvm] [AMDGPU] Update code object metadata for kernarg preload (PR #134666)

via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 7 08:08:30 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-support

Author: Austin Kerbow (kerbowa)

<details>
<summary>Changes</summary>

Adds new code object metadata that tracks the registers into which explicit
and hidden arguments are preloaded.

IR arguments may be split across multiple parts by isel, and SGPR tuple
alignment means that an argument may be spread across multiple
registers.

To support this, some of the utilities for hidden kernel arguments are
moved to `AMDGPUArgumentUsageInfo.h`. Additional bookkeeping is also
needed for tracking purposes.

---

Patch is 78.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134666.diff


21 Files Affected:

- (modified) llvm/include/llvm/Support/AMDGPUMetadata.h (+1-1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp (+34) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h (+87-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp (+312-59) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h (+26-8) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (+10-59) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+39-7) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+8-3) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+2-2) 
- (added) llvm/test/CodeGen/AMDGPU/hsa-metadata-preload-args-v6.ll (+388) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll (+4-3) 


``````````diff
diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h
index 76ac7ab74a32e..d5e0f4031b0f6 100644
--- a/llvm/include/llvm/Support/AMDGPUMetadata.h
+++ b/llvm/include/llvm/Support/AMDGPUMetadata.h
@@ -47,7 +47,7 @@ constexpr uint32_t VersionMinorV5 = 2;
 /// HSA metadata major version for code object V6.
 constexpr uint32_t VersionMajorV6 = 1;
 /// HSA metadata minor version for code object V6.
-constexpr uint32_t VersionMinorV6 = 2;
+constexpr uint32_t VersionMinorV6 = 3;
 
 /// Old HSA metadata beginning assembler directive for V2. This is only used for
 /// diagnostics now.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d158f0f58d711..06504a081e6f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -16,12 +16,15 @@
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
+using namespace llvm::KernArgPreload;
 
 #define DEBUG_TYPE "amdgpu-argument-reg-usage-info"
 
 INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE,
                 "Argument Register Usage Information Storage", false, true)
 
+constexpr HiddenArgInfo HiddenArgUtils::HiddenArgs[END_HIDDEN_ARGS];
+
 void ArgDescriptor::print(raw_ostream &OS,
                           const TargetRegisterInfo *TRI) const {
   if (!isSet()) {
@@ -176,6 +179,37 @@ AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
   return AI;
 }
 
+SmallVector<const KernArgPreloadDescriptor *, 4>
+AMDGPUFunctionArgInfo::getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const {
+  SmallVector<const KernArgPreloadDescriptor *, 4> Results;
+  for (const auto &KV : PreloadKernArgs) {
+    if (KV.second.OrigArgIdx == ArgIdx)
+      Results.push_back(&KV.second);
+  }
+
+  llvm::stable_sort(Results, [](const KernArgPreloadDescriptor *A,
+                                const KernArgPreloadDescriptor *B) {
+    return A->PartIdx < B->PartIdx;
+  });
+
+  return Results;
+}
+
+std::optional<const KernArgPreloadDescriptor *>
+AMDGPUFunctionArgInfo::getHiddenArgPreloadDescriptor(HiddenArg HA) const {
+  assert(HA < END_HIDDEN_ARGS);
+
+  auto HiddenArgIt = PreloadHiddenArgsIndexMap.find(HA);
+  if (HiddenArgIt == PreloadHiddenArgsIndexMap.end())
+    return std::nullopt;
+
+  auto KernArgIt = PreloadKernArgs.find(HiddenArgIt->second);
+  if (KernArgIt == PreloadKernArgs.end())
+    return std::nullopt;
+
+  return &KernArgIt->second;
+}
+
 const AMDGPUFunctionArgInfo &
 AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
   auto I = ArgInfoMap.find(&F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index e07d47381ecca..ee4dba31f2617 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -11,7 +11,10 @@
 
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Register.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
 
 namespace llvm {
@@ -95,11 +98,78 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
   return OS;
 }
 
-struct KernArgPreloadDescriptor : public ArgDescriptor {
-  KernArgPreloadDescriptor() {}
-  SmallVector<MCRegister> Regs;
+namespace KernArgPreload {
+
+enum HiddenArg {
+  HIDDEN_BLOCK_COUNT_X,
+  HIDDEN_BLOCK_COUNT_Y,
+  HIDDEN_BLOCK_COUNT_Z,
+  HIDDEN_GROUP_SIZE_X,
+  HIDDEN_GROUP_SIZE_Y,
+  HIDDEN_GROUP_SIZE_Z,
+  HIDDEN_REMAINDER_X,
+  HIDDEN_REMAINDER_Y,
+  HIDDEN_REMAINDER_Z,
+  END_HIDDEN_ARGS
 };
 
+// Stores information about a specific hidden argument.
+struct HiddenArgInfo {
+  // Offset in bytes from the location in the kernarg segment pointed to by
+  // the implicitarg pointer.
+  uint8_t Offset;
+  // The size of the hidden argument in bytes.
+  uint8_t Size;
+  // The name of the hidden argument in the kernel signature.
+  const char *Name;
+};
+
+struct HiddenArgUtils {
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+      if (HiddenArgs[I].Offset == Offset)
+        return static_cast<HiddenArg>(I);
+
+    return END_HIDDEN_ARGS;
+  }
+
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS)
+      return static_cast<Type *>(Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8));
+
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA < END_HIDDEN_ARGS) {
+      return HiddenArgs[HA].Name;
+    }
+    llvm_unreachable("Unexpected hidden argument.");
+  }
+};
+
+struct KernArgPreloadDescriptor {
+  // Id of the original argument in the IR kernel function argument list.
+  unsigned OrigArgIdx = 0;
+
+  // If this IR argument was split into multiple parts, this is the index of the
+  // part in the original argument.
+  unsigned PartIdx = 0;
+
+  // The registers that the argument is preloaded into. The argument may be
+  // split across multiple registers.
+  SmallVector<MCRegister, 2> Regs;
+};
+
+} // namespace KernArgPreload
+
 struct AMDGPUFunctionArgInfo {
   // clang-format off
   enum PreloadedValue {
@@ -161,7 +231,10 @@ struct AMDGPUFunctionArgInfo {
   ArgDescriptor WorkItemIDZ;
 
   // Map the index of preloaded kernel arguments to its descriptor.
-  SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
+  SmallDenseMap<int, KernArgPreload::KernArgPreloadDescriptor>
+      PreloadKernArgs{};
+  // Map hidden argument to the index of its descriptor.
+  SmallDenseMap<KernArgPreload::HiddenArg, int> PreloadHiddenArgsIndexMap{};
   // The first user SGPR allocated for kernarg preloading.
   Register FirstKernArgPreloadReg;
 
@@ -169,6 +242,16 @@ struct AMDGPUFunctionArgInfo {
   getPreloadedValue(PreloadedValue Value) const;
 
   static AMDGPUFunctionArgInfo fixedABILayout();
+
+  // Returns preload argument descriptors for an IR argument index. Isel may
+  // split IR arguments into multiple parts; the returned vector holds all
+  // parts associated with an IR argument in the kernel signature.
+  SmallVector<const KernArgPreload::KernArgPreloadDescriptor *, 4>
+  getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const;
+
+  // Returns the `KernArgPreloadDescriptor` of a hidden argument if preloaded.
+  std::optional<const KernArgPreload::KernArgPreloadDescriptor *>
+  getHiddenArgPreloadDescriptor(KernArgPreload::HiddenArg HA) const;
 };
 
 class AMDGPUArgumentUsageInfo : public ImmutablePass {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 2991778a1bbc7..f6f71b2d042d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -15,6 +15,7 @@
 #include "AMDGPUHSAMetadataStreamer.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIProgramInfo.h"
@@ -290,7 +291,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
     if (Arg.hasAttribute("amdgpu-hidden-argument"))
       continue;
 
-    emitKernelArg(Arg, Offset, Args);
+    emitKernelArg(Arg, Offset, Args, MF);
   }
 
   emitHiddenKernelArgs(MF, Offset, Args);
@@ -300,7 +301,8 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
 
 void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
                                               unsigned &Offset,
-                                              msgpack::ArrayDocNode Args) {
+                                              msgpack::ArrayDocNode Args,
+                                              const MachineFunction &MF) {
   const auto *Func = Arg.getParent();
   auto ArgNo = Arg.getArgNo();
   const MDNode *Node;
@@ -357,17 +359,18 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
   Align ArgAlign;
   std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
 
-  emitKernelArg(DL, ArgTy, ArgAlign,
-                getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
-                PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual,
-                AccQual, TypeQual);
+  emitKernelArgImpl(DL, ArgTy, ArgAlign,
+                    getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
+                    "" /* PreloadRegisters */, PointeeAlign, Name, TypeName,
+                    BaseTypeName, ActAccQual, AccQual, TypeQual);
 }
 
-void MetadataStreamerMsgPackV4::emitKernelArg(
+void MetadataStreamerMsgPackV4::emitKernelArgImpl(
     const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
-    unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
-    StringRef Name, StringRef TypeName, StringRef BaseTypeName,
-    StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) {
+    unsigned &Offset, msgpack::ArrayDocNode Args, StringRef PreloadRegisters,
+    MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName,
+    StringRef BaseTypeName, StringRef ActAccQual, StringRef AccQual,
+    StringRef TypeQual) {
   auto Arg = Args.getDocument()->getMapNode();
 
   if (!Name.empty())
@@ -409,6 +412,11 @@ void MetadataStreamerMsgPackV4::emitKernelArg(
       Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
   }
 
+  if (!PreloadRegisters.empty()) {
+    Arg[".preload_registers"] =
+        Arg.getDocument()->getNode(PreloadRegisters, /*Copy=*/true);
+  }
+
   Args.push_back(Arg);
 }
 
@@ -428,14 +436,14 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
   Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
 
   if (HiddenArgNumBytes >= 8)
-    emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
+                      Args);
   if (HiddenArgNumBytes >= 16)
-    emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
+                      Args);
   if (HiddenArgNumBytes >= 24)
-    emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
+                      Args);
 
   auto *Int8PtrTy =
       PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
@@ -445,42 +453,42 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
     // before code object V5, which makes the mutual exclusion between the
     // "printf buffer" and "hostcall buffer" here sound.
     if (M->getNamedMetadata("llvm.printf.fmts"))
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
-                    Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+                        Args);
     else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
-                    Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer",
+                        Offset, Args);
     else
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
   }
 
   // Emit "default queue" and "completion action" arguments if enqueue kernel is
   // used, otherwise emit dummy "none" arguments.
   if (HiddenArgNumBytes >= 40) {
     if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
-                    Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+                        Args);
     } else {
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
     }
   }
 
   if (HiddenArgNumBytes >= 48) {
     if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
-                    Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
+                        Offset, Args);
     } else {
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
     }
   }
 
   // Emit the pointer argument for multi-grid object.
   if (HiddenArgNumBytes >= 56) {
     if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
-                    Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
+                        Offset, Args);
     } else {
-      emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+      emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
     }
   }
 }
@@ -635,77 +643,83 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
   auto *Int16Ty = Type::getInt16Ty(Func.getContext());
 
   Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
-  emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
-  emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
-  emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
+  emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset,
+                    Args);
+  emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset,
+                    Args);
+  emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset,
+                    Args);
 
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
 
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
 
   // Reserved for hidden_tool_correlation_id.
   Offset += 8;
 
   Offset += 8; // Reserved.
 
-  emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args);
-  emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args);
-  emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args);
+  emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
+                    Args);
+  emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
+                    Args);
+  emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
+                    Args);
 
-  emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
+  emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
 
   Offset += 6; // Reserved.
   auto *Int8PtrTy =
       PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
 
   if (M->getNamedMetadata("llvm.printf.fmts")) {
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+                      Args);
   } else {
     Offset += 8; // Skipped.
   }
 
   if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
+                      Args);
   } else {
     Offset += 8; // Skipped.
   }
 
   if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
-                Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
+                      Offset, Args);
   } else {
     Offset += 8; // Skipped.
   }
 
   if (!Func.hasFnAttribute("amdgpu-no-heap-ptr"))
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
   else
     Offset += 8; // Skipped.
 
   if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+                      Args);
   } else {
     Offset += 8; // Skipped.
   }
 
   if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
+                      Offset, Args);
   } else {
     Offset += 8; // Skipped.
   }
 
   // Emit argument for hidden dynamic lds size
   if (MFI.isDynamicLDSUsed()) {
-    emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
-                  Args);
+    emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
+                      Args);
   } else {
     Offset += 4; // skipped
   }
@@ -715,14 +729,17 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
   // hidden_private_base and hidden_shared_base are only when the subtarget has
   // ApertureRegs.
   if (!ST.hasApertureRegs()) {
-    emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
-    emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
+    emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_private_base", Offset,
+                      Args);
+    emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_shared_base", Offset,
+                      Args);
   } else {
     Offset += 8; // Skipped.
   }
 
   if (MFI.getUserSGPRInfo().hasQueuePtr())
-    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
+    emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset,
+                      Args);
 }
 
 void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
@@ -745,5 +762,241 @@ void MetadataStreamerMsgPackV6::emitVersion() {
   getRootMetadata("amdhsa.version") = Version;
 }
 
+void MetadataStreamerMsgPackV6::emitHiddenKernelArgWithPreload(
+    const DataLayout &DL, Type *ArgTy, Align Alignment,
+    KernArgPreload::HiddenArg HiddenArg, StringRef ArgName, unsigned &Offset,
+    msgpack::ArrayDocNode Args, const AMDGPUFunctionArgInfo &ArgInfo) {
+
+  SmallString<16> PreloadStr;
+  auto PreloadDesc = ArgInfo.getHiddenArgPreloadDescriptor(HiddenArg);
+  if (PreloadDesc) {
+    const auto &Regs = (*PreloadDesc)->Regs;
+    for (unsigned I = 0; I < Regs.size(); ++I) {
+      if (I > 0)
+        PreloadStr += " ";
+      PreloadStr += AMDGPUInstPrinter::getRegisterName(Regs[I]);
+    }
+  }
+  emitKernelArgImpl(DL, ArgTy, Alignment, ArgName, Offset, Args, PreloadStr);
+}
+
+void MetadataStreamerMsgPackV6::emitHiddenKernelArgs(
+    const MachineFunction &MF, unsigne...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/134666


More information about the llvm-commits mailing list