[llvm] [AMDGPU] Avoid resource propagation for recursion through multiple functions (PR #111004)

Thu Oct 3 08:21:37 PDT 2024

https://github.com/JanekvO created https://github.com/llvm/llvm-project/pull/111004

Avoid constructing recursive MCExpr definitions when multiple functions cause a recursion.

Fixes #110863 

>From ded865a8b1a42de35d99612224825c591766d2c9 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Thu, 3 Oct 2024 16:13:46 +0100
Subject: [PATCH] [AMDGPU] Avoid metadata propagation for recursion through
 multiple functions

---
 .../Target/AMDGPU/AMDGPUMCResourceInfo.cpp    |  7 +-
 .../CodeGen/AMDGPU/agpr-register-count.ll     |  2 +-
 .../amdpal-metadata-agpr-register-count.ll    |  2 +-
 .../CodeGen/AMDGPU/function-resource-usage.ll | 73 ++++++++++++++++
 .../hsa-metadata-agpr-register-count.ll       |  2 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |  4 +-
 llvm/test/CodeGen/AMDGPU/recursion.ll         | 18 ++--
 llvm/test/CodeGen/AMDGPU/recursive-mcexpr.ll  | 85 +++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |  4 +-
 9 files changed, 180 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/recursive-mcexpr.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index da0397fa20bd1b..dd970c78e66e91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -109,10 +109,13 @@ void MCResourceInfo::assignResourceInfoExpr(
     for (const Function *Callee : Callees) {
       if (!Seen.insert(Callee).second)
         continue;
+      if (!F.doesNotRecurse() && !Callee->doesNotRecurse())
+        continue;
       MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK, OutContext);
       ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
     }
-    SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
+    if (ArgExprs.size() > 1)
+      SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
   }
   MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
   Sym->setVariableValue(SymVal);
@@ -164,6 +167,8 @@ void MCResourceInfo::gatherResourceInfo(
     for (const Function *Callee : FRI.Callees) {
       if (!Seen.insert(Callee).second)
         continue;
+      if (!MF.getFunction().doesNotRecurse() && !Callee->doesNotRecurse())
+        continue;
       if (!Callee->isDeclaration()) {
         MCSymbol *calleeValSym =
             getSymbol(Callee->getName(), RIK_PrivateSegSize, OutContext);
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 0e16ea10c019ac..189b2d80827896 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -178,4 +178,4 @@ bb:
 ; GCN-NEXT: .set amdgpu.max_num_agpr, 32
 ; GCN-NEXT: .set amdgpu.max_num_sgpr, 34
 
-attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
index 8f4cb364751d88..a01bfc6c8730da 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -77,4 +77,4 @@ bb:
 ; GFX908: agpr_count:  0x20
 ; GFX908: vgpr_count:  0x20
 
-attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }
+attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,512" }
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index d3a6b4e01ebfb8..3c57689781378f 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -481,6 +481,79 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}multi_stage_recurse2:
+; GCN: .set multi_stage_recurse2.num_vgpr, 41
+; GCN: .set multi_stage_recurse2.num_agpr, 0
+; GCN: .set multi_stage_recurse2.numbered_sgpr, 34
+; GCN: .set multi_stage_recurse2.private_seg_size, 16
+; GCN: .set multi_stage_recurse2.uses_vcc, 1
+; GCN: .set multi_stage_recurse2.uses_flat_scratch, 0
+; GCN: .set multi_stage_recurse2.has_dyn_sized_stack, 0
+; GCN: .set multi_stage_recurse2.has_recursion, 1
+; GCN: .set multi_stage_recurse2.has_indirect_call, 0
+; GCN: TotalNumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+; GCN-LABEL: {{^}}multi_stage_recurse1:
+; GCN: .set multi_stage_recurse1.num_vgpr, 41
+; GCN: .set multi_stage_recurse1.num_agpr, 0
+; GCN: .set multi_stage_recurse1.numbered_sgpr, 34
+; GCN: .set multi_stage_recurse1.private_seg_size, 16
+; GCN: .set multi_stage_recurse1.uses_vcc, 1
+; GCN: .set multi_stage_recurse1.uses_flat_scratch, 0
+; GCN: .set multi_stage_recurse1.has_dyn_sized_stack, 0
+; GCN: .set multi_stage_recurse1.has_recursion, 1
+; GCN: .set multi_stage_recurse1.has_indirect_call, 0
+; GCN: TotalNumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @multi_stage_recurse1(i32 %val) #2 {
+  call void @multi_stage_recurse2(i32 %val)
+  ret void
+}
+define void @multi_stage_recurse2(i32 %val) #2 {
+  call void @multi_stage_recurse1(i32 %val)
+  ret void
+}
+
+; GCN-LABEL: {{^}}usage_multi_stage_recurse:
+; GCN: .set usage_multi_stage_recurse.num_vgpr, max(32, multi_stage_recurse1.num_vgpr)
+; GCN: .set usage_multi_stage_recurse.num_agpr, max(0, multi_stage_recurse1.num_agpr)
+; GCN: .set usage_multi_stage_recurse.numbered_sgpr, max(33, multi_stage_recurse1.numbered_sgpr)
+; GCN: .set usage_multi_stage_recurse.private_seg_size, 0+(max(multi_stage_recurse1.private_seg_size))
+; GCN: .set usage_multi_stage_recurse.uses_vcc, or(1, multi_stage_recurse1.uses_vcc)
+; GCN: .set usage_multi_stage_recurse.uses_flat_scratch, or(1, multi_stage_recurse1.uses_flat_scratch)
+; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
+; GCN: .set usage_multi_stage_recurse.has_recursion, or(1, multi_stage_recurse1.has_recursion)
+; GCN: .set usage_multi_stage_recurse.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
+; GCN: TotalNumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
+  call void @multi_stage_recurse1(i32 %n)
+  ret void
+}
+
+; GCN-LABEL: {{^}}multi_call_with_multi_stage_recurse:
+; GCN:  .set multi_call_with_multi_stage_recurse.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr, multi_stage_recurse1.num_vgpr)
+; GCN:  .set multi_call_with_multi_stage_recurse.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr, multi_stage_recurse1.num_agpr)
+; GCN:  .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(43, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr)
+; GCN:  .set multi_call_with_multi_stage_recurse.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size, multi_stage_recurse1.private_seg_size))
+; GCN:  .set multi_call_with_multi_stage_recurse.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc, multi_stage_recurse1.uses_vcc)
+; GCN:  .set multi_call_with_multi_stage_recurse.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch, multi_stage_recurse1.uses_flat_scratch)
+; GCN:  .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
+; GCN:  .set multi_call_with_multi_stage_recurse.has_recursion, or(1, use_stack0.has_recursion, use_stack1.has_recursion, multi_stage_recurse1.has_recursion)
+; GCN:  .set multi_call_with_multi_stage_recurse.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call, multi_stage_recurse1.has_indirect_call)
+; GCN: TotalNumSgprs: 49
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2052
+define amdgpu_kernel void @multi_call_with_multi_stage_recurse(i32 %n) #0 {
+  call void @use_stack0()
+  call void @use_stack1()
+  call void @multi_stage_recurse1(i32 %n)
+  ret void
+}
+
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN:	.set count_use_sgpr96_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr)
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
index 380a8e911e4995..fdc3f3957b828b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
@@ -98,4 +98,4 @@ bb:
   ret void
 }
 
-attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 957f404c8cdbed..85d23323ec6dec 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -131,6 +131,6 @@ bb:
 declare dso_local void @eggs()
 
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind norecurse }
+attributes #1 = { nounwind noinline norecurse "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
 attributes #2 = { norecurse }
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index c0d228e1254e64..479f46faf79d3e 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -41,11 +41,11 @@ define void @tail_recursive_with_stack() {
 ; For an arbitrary recursive call, report a large number for unknown stack
 ; usage for code object v4 and older
 ; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
+; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384))
 ;
 ; V5-LABEL: {{^}}calls_recursive:
-; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
-; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
+; V5: .set calls_recursive.private_seg_size, 0
+; V5: .set calls_recursive.has_dyn_sized_stack, 0
 define amdgpu_kernel void @calls_recursive() {
   call void @recursive()
   ret void
@@ -65,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
 ; in the kernel.
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
+; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384))
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive:
-; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
-; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
+; V5: .set kernel_calls_tail_recursive.private_seg_size, 0
+; V5: .set kernel_calls_tail_recursive.has_recursion, 1
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
   call void @tail_recursive()
   ret void
 }
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
+; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384))
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
-; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
+; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0
+; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, 0
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
   call void @tail_recursive_with_stack()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/recursive-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/recursive-mcexpr.ll
new file mode 100644
index 00000000000000..630a0923e31287
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/recursive-mcexpr.ll
@@ -0,0 +1,85 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -mcpu=gfx90a < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}qux
+; CHECK: .set qux.num_vgpr, 41
+; CHECK: .set qux.num_agpr, 0
+; CHECK: .set qux.numbered_sgpr, 34
+; CHECK: .set qux.private_seg_size, 16
+; CHECK: .set qux.uses_vcc, 1
+; CHECK: .set qux.uses_flat_scratch, 0
+; CHECK: .set qux.has_dyn_sized_stack, 0
+; CHECK: .set qux.has_recursion, 1
+; CHECK: .set qux.has_indirect_call, 0
+
+; CHECK-LABEL: {{^}}baz
+; CHECK: .set baz.num_vgpr, 42
+; CHECK: .set baz.num_agpr, 0
+; CHECK: .set baz.numbered_sgpr, 34
+; CHECK: .set baz.private_seg_size, 16
+; CHECK: .set baz.uses_vcc, 1
+; CHECK: .set baz.uses_flat_scratch, 0
+; CHECK: .set baz.has_dyn_sized_stack, 0
+; CHECK: .set baz.has_recursion, 1
+; CHECK: .set baz.has_indirect_call, 0
+
+; CHECK-LABEL: {{^}}bar
+; CHECK: .set bar.num_vgpr, 42
+; CHECK: .set bar.num_agpr, 0
+; CHECK: .set bar.numbered_sgpr, 34
+; CHECK: .set bar.private_seg_size, 16
+; CHECK: .set bar.uses_vcc, 1
+; CHECK: .set bar.uses_flat_scratch, 0
+; CHECK: .set bar.has_dyn_sized_stack, 0
+; CHECK: .set bar.has_recursion, 1
+; CHECK: .set bar.has_indirect_call, 0
+
+; CHECK-LABEL: {{^}}foo
+; CHECK: .set foo.num_vgpr, 42
+; CHECK: .set foo.num_agpr, 0
+; CHECK: .set foo.numbered_sgpr, 34
+; CHECK: .set foo.private_seg_size, 16
+; CHECK: .set foo.uses_vcc, 1
+; CHECK: .set foo.uses_flat_scratch, 0
+; CHECK: .set foo.has_dyn_sized_stack, 0
+; CHECK: .set foo.has_recursion, 1
+; CHECK: .set foo.has_indirect_call, 0
+
+define void @foo() {
+entry:
+  call void @bar()
+  ret void
+}
+
+define void @bar() {
+entry:
+  call void @baz()
+  ret void
+}
+
+define void @baz() {
+entry:
+  call void @qux()
+  ret void
+}
+
+define void @qux() {
+entry:
+  call void @foo()
+  ret void
+}
+
+; CHECK-LABEL: {{^}}usefoo
+; CHECK: .set usefoo.num_vgpr, 32
+; CHECK: .set usefoo.num_agpr, 0
+; CHECK: .set usefoo.numbered_sgpr, 33
+; CHECK: .set usefoo.private_seg_size, 0
+; CHECK: .set usefoo.uses_vcc, 1
+; CHECK: .set usefoo.uses_flat_scratch, 1
+; CHECK: .set usefoo.has_dyn_sized_stack, 0
+; CHECK: .set usefoo.has_recursion, 1
+; CHECK: .set usefoo.has_indirect_call, 0
+define amdgpu_kernel void @usefoo() {
+  call void @foo()
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 5536a09538e6ee..4639bf4a678d49 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -468,5 +468,5 @@ entry:
   ret <2 x i64> %ret
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind norecurse }
+attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }