[llvm] f279c47 - AMDGPU gfx12: Add _dvgpr$ symbols for dynamic VGPRs (#148251)

Fri Aug 15 08:33:10 PDT 2025

Author: Tim Renouf
Date: 2025-08-15T16:33:06+01:00
New Revision: f279c47cb3e7191a22703b837e006eb7dd591de7

URL: https://github.com/llvm/llvm-project/commit/f279c47cb3e7191a22703b837e006eb7dd591de7
DIFF: https://github.com/llvm/llvm-project/commit/f279c47cb3e7191a22703b837e006eb7dd591de7.diff

LOG: AMDGPU gfx12: Add _dvgpr$ symbols for dynamic VGPRs (#148251)

For each function with the AMDGPU_CS_Chain calling convention, with
dynamic VGPRs enabled, add a _dvgpr$ symbol, with the value of the
function symbol, plus an offset encoding one less than the number of
VGPR blocks used by the function (16 or 32 VGPRs per block, at most 8 blocks)
in bits 5..3 of the symbol value. This is used by a front-end to have
functions that are chained rather than called, and a dispatcher that
dynamically resizes the VGPR count before dispatching to a function.

Added: 
    llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll
    llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll
    llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8d0786ab0440d..ef2a98f09967c 100644

--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1771,6 +1771,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                                       using dedicated instructions, but may not send the DEALLOC_VGPRS
                                                       message. If a shader has this attribute, then all its callees must
                                                       match its value.
+                                                      An amdgpu_cs_chain CC function with this enabled has an extra symbol
+                                                      prefixed with "_dvgpr$" with the value of the function symbol,
+                                                      offset by one less than the number of dynamic VGPR blocks required
+                                                      by the function encoded in bits 5..3.
 
      ================================================ ==========================================================
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 188c126cb9fbe..66c3fad6de1a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -41,6 +41,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/Compiler.h"
@@ -733,6 +734,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                      OutContext, IsLocal));
   }
 
+  // Emit _dvgpr$ symbol when appropriate.
+  emitDVgprSymbol(MF);
+
   if (isVerbose()) {
     MCSectionELF *CommentSection =
         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
@@ -875,6 +879,49 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   return false;
 }
 
+// For an AMDGPU_CS_Chain function with dynamic VGPRs enabled, emit a _dvgpr$
+// symbol whose value is the function symbol plus (NumBlocks - 1) << 3, i.e.
+// one less than the number of VGPR blocks used, encoded in bits 5..3. A "VGPR
+// block" is either 16 VGPRs (for a max of 128) or 32 VGPRs (for a max of 256);
+// more than 8 blocks is reported as an error and no symbol is emitted. This is
+// used by a front-end to have functions that are chained rather than called,
+// and a dispatcher that resizes the VGPR count before dispatching to one.
+void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  if (MFI.isDynamicVGPREnabled() &&
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
+    MCContext &Ctx = MF.getContext();
+    unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
+    MCValue NumVGPRs;
+    if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
+            NumVGPRs, nullptr) ||
+        !NumVGPRs.isAbsolute()) {
+      llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
+    }
+    // Calculate number of VGPR blocks, rounding up to a whole block.
+    // Treat 0 VGPRs as 1 VGPR so that NumBlocks - 1 below cannot underflow.
+    unsigned NumBlocks =
+        divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
+
+    if (NumBlocks > 8) {
+      OutContext.reportError({},
+                             "too many DVGPR blocks for _dvgpr$ symbol for '" +
+                                 Twine(CurrentFnSym->getName()) + "'");
+      return;
+    }
+    unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
+    // Offset the function symbol by it to form the _dvgpr$ symbol value.
+    const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
+        MCSymbolRefExpr::create(CurrentFnSym, Ctx),
+        MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
+    MCSymbol *DVgprFuncSym =
+        Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
+    OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
+    emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
+    emitLinkage(&MF.getFunction(), DVgprFuncSym);
+  }
+}
+
 // TODO: Fold this into emitFunctionBodyStart.
 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
   // In the beginning all features are either 'Any' or 'NotSupported',

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 63589d2b90062..9e854fa554672 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -54,6 +54,9 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
 
   MCCodeEmitter *DumpCodeInstEmitter = nullptr;
 
+  // When appropriate, add a _dvgpr$ symbol.
+  void emitDVgprSymbol(MachineFunction &MF);
+
   void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
   void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
                         const SIProgramInfo &KernelInfo,

diff  --git a/llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll b/llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll
new file mode 100644
index 0000000000000..a7c1c223546f5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll
@@ -0,0 +1,70 @@
+; Test generation of _dvgpr$ symbol for an amdgpu_cs_chain function with dynamic vgprs.
+
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=DVGPR %s
+
+; Function with 0 VGPRs, which counts as 1 block.
+;
+; DVGPR-LABEL: func0:
+; DVGPR: .set _dvgpr$func0, func0+0
+;
+define amdgpu_cs_chain void @func0() #0 {
+  ret void
+}
+
+; Function with 21 VGPRs, which is 2 blocks.
+;
+; DVGPR-LABEL: func21:
+; DVGPR: .set func21.num_vgpr, 21
+; DVGPR: .set _dvgpr$func21, func21+8
+;
+define amdgpu_cs_chain void @func21(<13 x float> %arg) #0 {
+  tail call void @func21(<13 x float> %arg)
+  ret void
+}
+
+; Anonymous function with 87 VGPRs, which is 6 blocks.
+;
+; DVGPR: [[FUNC87:__unnamed[^:]*]]:
+; DVGPR: .set [[FUNC87]].num_vgpr, 87
+; DVGPR: .set _dvgpr$[[FUNC87]], [[FUNC87]]+40
+;
+define amdgpu_cs_chain void @0(<79 x float> %arg) #0 {
+  tail call void @0(<79 x float> %arg)
+  ret void
+}
+
+; Function with 128 VGPRs, which is 8 blocks.
+;
+; DVGPR-LABEL: func128:
+; DVGPR: .set func128.num_vgpr, 128
+; DVGPR: .set _dvgpr$func128, func128+56
+;
+define amdgpu_cs_chain void @func128(<120 x float> %arg) #0 {
+  tail call void @func128(<120 x float> %arg)
+  ret void
+}
+
+; Function with 79 VGPRs, which is 3 blocks with a block size of 32.
+;
+; DVGPR-LABEL: func79:
+; DVGPR: .set func79.num_vgpr, 79
+; DVGPR: .set _dvgpr$func79, func79+16
+;
+define amdgpu_cs_chain void @func79(<71 x float> %arg) #1 {
+  tail call void @func79(<71 x float> %arg)
+  ret void
+}
+
+; Function with 225 VGPRs, which is 8 blocks with a block size of 32.
+;
+; DVGPR-LABEL: func225:
+; DVGPR: .set func225.num_vgpr, 225
+; DVGPR: .set _dvgpr$func225, func225+56
+;
+define amdgpu_cs_chain void @func225(<217 x float> %arg) #1 {
+  tail call void @func225(<217 x float> %arg)
+  ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size"="16" }
+attributes #1 = { "amdgpu-dynamic-vgpr-block-size"="32" }

diff  --git a/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll
new file mode 100644
index 0000000000000..362a5e7286285
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll
@@ -0,0 +1,15 @@
+; Test failure to generate the _dvgpr$ symbol for an amdgpu_cs_chain function with dynamic vgprs.
+
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefixes=ERR %s
+
+; Function with 129 VGPRs, which is too many with a block size of 16.
+;
+; ERR-DAG: .set func129.num_vgpr, 129
+; ERR-DAG: too many DVGPR blocks for _dvgpr$ symbol for 'func129'
+;
+define amdgpu_cs_chain void @func129(<121 x float> %arg) #0 {
+  tail call void @func129(<121 x float> %arg)
+  ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size"="16" }

diff  --git a/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll
new file mode 100644
index 0000000000000..218c009809a50
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll
@@ -0,0 +1,24 @@
+; Test failure to generate the _dvgpr$ symbol for an anonymous amdgpu_cs_chain function with dynamic vgprs.
+
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefixes=ERR %s
+
+; Anonymous function with 129 VGPRs, which is too many with a block size of 16.
+;
+; ERR-DAG: .set __unnamed_1.num_vgpr, 129
+; ERR-DAG: too many DVGPR blocks for _dvgpr$ symbol for '__unnamed_1'
+;
+define amdgpu_cs_chain void @0(<121 x float> %arg) #0 {
+  tail call void @0(<121 x float> %arg)
+  ret void
+}
+
+; Function that is OK, that chains to @0.
+;
+define amdgpu_cs_chain void @funcOk(<16 x float> %arg) {
+  %vec87 = shufflevector <16 x float> %arg, <16 x float> %arg, <121 x i32> splat(i32 0)
+  tail call void @0(<121 x float> %vec87)
+  ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size"="16" }
+