[llvm] [AMDGPU] Flatten recursive register resource info propagation (PR #142766)

Janek van Oirschot via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 9 04:15:10 PDT 2025


https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/142766

>From a7e3c8eaa19eaa48357642ddc6b756f00d0b1fe9 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Wed, 4 Jun 2025 03:57:25 -0700
Subject: [PATCH 1/2] [AMDGPU] Flatten recursive register resource info
 propagation

---
 .../Target/AMDGPU/AMDGPUMCResourceInfo.cpp    | 101 +++++++++++++++---
 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h |   6 ++
 .../CodeGen/AMDGPU/function-resource-usage.ll |  32 +++---
 .../AMDGPU/recursive-resource-usage-mcexpr.ll |  82 +++++++++++++-
 4 files changed, 189 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 7d2596d666185..37a3b99baa2ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -97,6 +97,87 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
   return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
 }
 
+// Tries to flatten recursive call register resource gathering. Simple
+// cycle-avoiding DFS to find the constants in the propagated symbols.
+// Assumes:
+// - RecSym has been confirmed to recurse (this means the callee symbols should
+//   all be populated, starting from RecSym).
+// - Shape of the resource symbol's MCExpr (`max` args are order agnostic):
+//   RecSym.MCExpr := max(<constant>+, <callee_symbol>*)
+const MCExpr *MCResourceInfo::flattenedCycleMax(MCSymbol *RecSym,
+                                                ResourceInfoKind RIK,
+                                                MCContext &OutContext) {
+  SmallPtrSet<const MCExpr *, 8> Seen;
+  SmallVector<const MCExpr *, 8> WorkList;
+  int64_t Maximum = 0;
+
+  const MCExpr *RecExpr = RecSym->getVariableValue();
+  WorkList.push_back(RecExpr);
+
+  while (!WorkList.empty()) {
+    const MCExpr *CurExpr = WorkList.pop_back_val();
+    switch (CurExpr->getKind()) {
+    default: {
+      // Assuming the recursion is of shape `max(<constant>, <callee_symbol>)`
+      // where <callee_symbol> will eventually recurse. If this condition holds,
+      // the recursion occurs within some other (possibly unresolvable) MCExpr,
+      // thus using the worst case value then.
+      if (CurExpr->isSymbolUsedInExpression(RecSym)) {
+        LLVM_DEBUG(dbgs() << "MCResUse:   " << RecSym->getName()
+                          << ": Recursion in unexpected sub-expression, using "
+                             "module maximum\n");
+        switch (RIK) {
+        default:
+          break;
+        case RIK_NumVGPR:
+          return MCSymbolRefExpr::create(getMaxVGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumSGPR:
+          return MCSymbolRefExpr::create(getMaxSGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumAGPR:
+          return MCSymbolRefExpr::create(getMaxAGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        }
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Constant: {
+      int64_t Val = cast<MCConstantExpr>(CurExpr)->getValue();
+      Maximum = std::max(Maximum, Val);
+      break;
+    }
+    case MCExpr::ExprKind::SymbolRef: {
+      const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(CurExpr);
+      const MCSymbol &SymRef = SymExpr->getSymbol();
+      if (SymRef.isVariable()) {
+        const MCExpr *SymVal = SymRef.getVariableValue();
+        auto [_, IsSeen] = Seen.insert(SymVal);
+        if (IsSeen)
+          WorkList.push_back(SymVal);
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Target: {
+      const AMDGPUMCExpr *TargetExpr = cast<AMDGPUMCExpr>(CurExpr);
+      if (TargetExpr->getKind() == AMDGPUMCExpr::VariantKind::AGVK_Max) {
+        for (auto &Arg : TargetExpr->getArgs())
+          WorkList.push_back(Arg);
+      }
+      break;
+    }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "MCResUse:   " << RecSym->getName()
+                    << ": Using flattened max: " << Maximum << '\n');
+
+  return MCConstantExpr::create(Maximum, OutContext);
+}
+
 void MCResourceInfo::assignResourceInfoExpr(
     int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
     const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
@@ -132,25 +213,19 @@ void MCResourceInfo::assignResourceInfoExpr(
                           << CalleeValSym->getName() << " as callee\n");
         ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
       } else {
-        LLVM_DEBUG(
-            dbgs() << "MCResUse:   " << Sym->getName()
-                   << ": Recursion found, falling back to module maximum\n");
-        // In case of recursion: make sure to use conservative register counts
-        // (i.e., specifically for VGPR/SGPR/AGPR).
+        LLVM_DEBUG(dbgs() << "MCResUse:   " << Sym->getName()
+                          << ": Recursion found, attempt flattening of cycle "
+                             "for resource usage\n");
+        // In case of recursion for vgpr/sgpr/agpr resource usage: try to
+        // flatten and use the max of the call cycle. May still end up emitting
+        // module max if not fully resolvable.
         switch (RIK) {
         default:
           break;
         case RIK_NumVGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxVGPRSymbol(OutContext), OutContext));
-          break;
         case RIK_NumSGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxSGPRSymbol(OutContext), OutContext));
-          break;
         case RIK_NumAGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxAGPRSymbol(OutContext), OutContext));
+          ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
           break;
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index a670878948c31..fa98f82d11022 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -58,6 +58,12 @@ class MCResourceInfo {
   // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
   void assignMaxRegs(MCContext &OutContext);
 
+  // Take flattened max of cyclic function calls' knowns. For example, for
+  // a cycle A->B->C->D->A, take max(A, B, C, D) for A and have B, C, D have the
+// propagated value from A.
+  const MCExpr *flattenedCycleMax(MCSymbol *RecSym, ResourceInfoKind RIK,
+                                  MCContext &OutContext);
+
 public:
   MCResourceInfo() = default;
   void addMaxVGPRCandidate(int32_t candidate) {
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 0a6aa05c2d212..2a18d40e0bd8a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -495,17 +495,17 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; GCN: NumVgprs: max(43, multi_stage_recurse1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse1:
-; GCN: .set multi_stage_recurse1.num_vgpr, max(48, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse1.num_vgpr, max(48, 43)
+; GCN: .set multi_stage_recurse1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, 34)
 ; GCN: .set multi_stage_recurse1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse1.has_recursion, 1
 ; GCN: .set multi_stage_recurse1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse1.numbered_sgpr+4
-; GCN: NumVgprs: max(48, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 38
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse1(i32 %val) #2 {
   call void @multi_stage_recurse2(i32 %val)
@@ -528,8 +528,8 @@ define void @multi_stage_recurse2(i32 %val) #2 {
 ; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse.has_recursion, or(1, multi_stage_recurse1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 40
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
   call void @multi_stage_recurse1(i32 %n)
@@ -550,17 +550,17 @@ define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
 ; GCN: NumVgprs: max(41, multi_stage_recurse_noattr1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse_noattr1:
-; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, 41)
+; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, 54)
 ; GCN: .set multi_stage_recurse_noattr1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse_noattr1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse_noattr1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_recursion, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse_noattr1.numbered_sgpr+4
-; GCN: NumVgprs: max(41, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 61
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse_noattr1(i32 %val) #0 {
   call void @multi_stage_recurse_noattr2(i32 %val)
@@ -583,8 +583,8 @@ define void @multi_stage_recurse_noattr2(i32 %val) #0 {
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_recursion, or(0, multi_stage_recurse_noattr1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_indirect_call, or(0, multi_stage_recurse_noattr1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse_noattrs.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse_noattrs.num_vgpr
+; GCN: TotalNumSgprs: 63
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
   call void @multi_stage_recurse_noattr1(i32 %n)
@@ -601,8 +601,8 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_recursion, or(1, use_stack0.has_recursion, use_stack1.has_recursion, multi_stage_recurse1.has_recursion)
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: multi_call_with_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs:  multi_call_with_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 59
+; GCN: NumVgprs:  48
 ; GCN: ScratchSize: 2052
 define amdgpu_kernel void @multi_call_with_multi_stage_recurse(i32 %n) #0 {
   call void @use_stack0()
diff --git a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
index 3093349bff37c..a41a06592f62f 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
 
+; Recursion: foo -> bar -> baz -> qux -> foo
+
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, max(71, foo.num_vgpr)
 ; CHECK: .set qux.num_agpr, max(0, foo.num_agpr)
@@ -34,9 +36,9 @@
 ; CHECK: .set bar.has_indirect_call, or(0, baz.has_indirect_call)
 
 ; CHECK-LABEL: {{^}}foo
-; CHECK: .set foo.num_vgpr, max(46, amdgpu.max_num_vgpr)
-; CHECK: .set foo.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set foo.numbered_sgpr, max(71, amdgpu.max_num_sgpr)
+; CHECK: .set foo.num_vgpr, max(46, 71)
+; CHECK: .set foo.num_agpr, max(0, 0)
+; CHECK: .set foo.numbered_sgpr, max(71, 61)
 ; CHECK: .set foo.private_seg_size, 16
 ; CHECK: .set foo.uses_vcc, 1
 ; CHECK: .set foo.uses_flat_scratch, 0
@@ -91,3 +93,77 @@ define amdgpu_kernel void @usefoo() {
   ret void
 }
 
+; Recursion: A -> B -> C -> A && C -> D -> C
+
+; CHECK-LABEL: {{^}}D
+; CHECK: .set D.num_vgpr, max(71, C.num_vgpr)
+; CHECK: .set D.num_agpr, max(0, C.num_agpr)
+; CHECK: .set D.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set D.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set D.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set D.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set D.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set D.has_recursion, or(1, C.has_recursion)
+; CHECK: .set D.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}C
+; CHECK: .set C.num_vgpr, max(42, A.num_vgpr, 71)
+; CHECK: .set C.num_agpr, max(0, A.num_agpr, 0)
+; CHECK: .set C.numbered_sgpr, max(71, A.numbered_sgpr, 71)
+; CHECK: .set C.private_seg_size, 16+max(A.private_seg_size)
+; CHECK: .set C.uses_vcc, or(1, A.uses_vcc)
+; CHECK: .set C.uses_flat_scratch, or(0, A.uses_flat_scratch)
+; CHECK: .set C.has_dyn_sized_stack, or(0, A.has_dyn_sized_stack)
+; CHECK: .set C.has_recursion, or(1, A.has_recursion)
+; CHECK: .set C.has_indirect_call, or(0, A.has_indirect_call)
+
+; CHECK-LABEL: {{^}}B
+; CHECK: .set B.num_vgpr, max(42, C.num_vgpr)
+; CHECK: .set B.num_agpr, max(0, C.num_agpr)
+; CHECK: .set B.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set B.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set B.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set B.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set B.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set B.has_recursion, or(1, C.has_recursion)
+; CHECK: .set B.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}A
+; CHECK: .set A.num_vgpr, max(42, 71)
+; CHECK: .set A.num_agpr, max(0, 0)
+; CHECK: .set A.numbered_sgpr, max(71, 71)
+; CHECK: .set A.private_seg_size, 16
+; CHECK: .set A.uses_vcc, 1
+; CHECK: .set A.uses_flat_scratch, 0
+; CHECK: .set A.has_dyn_sized_stack, 0
+; CHECK: .set A.has_recursion, 1
+; CHECK: .set A.has_indirect_call, 0
+
+define void @A() {
+  call void @B()
+  call void asm sideeffect "", "~{v10}"()
+  call void asm sideeffect "", "~{s50}"()
+  ret void
+}
+
+define void @B() {
+  call void @C()
+  call void asm sideeffect "", "~{v20}"()
+  call void asm sideeffect "", "~{s30}"()
+  ret void
+}
+
+define void @C() {
+  call void @A()
+  call void @D()
+  call void asm sideeffect "", "~{v30}"()
+  call void asm sideeffect "", "~{s40}"()
+  ret void
+}
+
+define void @D() {
+  call void @C()
+  call void asm sideeffect "", "~{v70}"()
+  call void asm sideeffect "", "~{s70}"()
+  ret void
+}

>From 9032fd41c0f0eff58fa01c2c80db3d65dfdeeb44 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Mon, 9 Jun 2025 04:14:24 -0700
Subject: [PATCH 2/2] feedback

---
 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 37a3b99baa2ac..1dede11a0ed20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -155,8 +155,7 @@ const MCExpr *MCResourceInfo::flattenedCycleMax(MCSymbol *RecSym,
       const MCSymbol &SymRef = SymExpr->getSymbol();
       if (SymRef.isVariable()) {
         const MCExpr *SymVal = SymRef.getVariableValue();
-        auto [_, IsSeen] = Seen.insert(SymVal);
-        if (IsSeen)
+        if (Seen.insert(SymVal).second)
           WorkList.push_back(SymVal);
       }
       break;



More information about the llvm-commits mailing list