[llvm] [AMDGPU] Use table strategy for LowerModuleLDSPass at O0 (PR #160181)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 22 13:58:12 PDT 2025


hjagasiaAMD updated https://github.com/llvm/llvm-project/pull/160181

>From edf3d4f7aa509e019655d4973554381172a23093 Mon Sep 17 00:00:00 2001
From: hjagasiaAMD <harsha.jagasia at amd.com>
Date: Mon, 22 Sep 2025 14:07:13 -0500
Subject: [PATCH] [AMDGPU] Use table strategy for LowerModuleLDSPass at O0

Ensure global variables accessed by only one kernel can stay in kernel
scope at O0 by switching to the table strategy for
AMDGPULowerModuleLDSPass. This prevents the LDS limit from being
exceeded for the kernel. At higher optimization levels, the additional
passes that run can achieve this without switching to the table
strategy.
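
For background on what the table strategy does: direct references to LDS
variables in non-kernel functions are replaced with a lookup through a
constant offset table indexed by the calling kernel's ID, so each kernel
only allocates the LDS it actually reaches. A rough sketch of the shape of
the lowered IR follows; the names, table dimensions, and contents here are
illustrative, not the pass's exact output:

  ; hypothetical table: one i32 slot per (kernel, variable) pair; the real
  ; pass fills each slot with that kernel's address for the variable
  @offset.table = internal addrspace(4) constant [3 x [1 x i32]] zeroinitializer

  declare i32 @llvm.amdgcn.lds.kernel.id()

  define internal void @helperA.lowered() {
    ; look up this kernel's offset for the variable instead of
    ; referencing the LDS global directly
    %id = call i32 @llvm.amdgcn.lds.kernel.id()
    %slot = getelementptr inbounds [3 x [1 x i32]], ptr addrspace(4) @offset.table, i32 0, i32 %id, i32 0
    %off = load i32, ptr addrspace(4) %slot
    %p = inttoptr i32 %off to ptr addrspace(3)
    store i8 1, ptr addrspace(3) %p
    ret void
  }

Under this scheme, a variable like gC in the test below can stay at kernel
scope for k2 even though its helper is not inlined at O0, which is what
keeps k2 under the 64 KiB LDS limit.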
---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       | 11 ++-
 .../AMDGPU/lower-module-lds-force-table-O0.ll | 92 +++++++++++++++++++
 2 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table-O0.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index f01d5f6726822..dae2bd53b6623 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -588,7 +588,7 @@ class AMDGPULowerModuleLDS {
     return OrderedKernels;
   }
 
-  static void partitionVariablesIntoIndirectStrategies(
+  void partitionVariablesIntoIndirectStrategies(
       Module &M, LDSUsesInfoTy const &LDSUsesInfo,
       VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
       DenseSet<GlobalVariable *> &ModuleScopeVariables,
@@ -596,6 +596,9 @@ class AMDGPULowerModuleLDS {
       DenseSet<GlobalVariable *> &KernelAccessVariables,
       DenseSet<GlobalVariable *> &DynamicVariables) {
 
+    if (TM.getOptLevel() == CodeGenOptLevel::None)
+      LoweringKindLoc = LoweringKind::table;
+
     GlobalVariable *HybridModuleRoot =
         LoweringKindLoc != LoweringKind::hybrid
             ? nullptr
@@ -1188,6 +1191,8 @@ class AMDGPULowerModuleLDS {
           // Allocated at zero, recorded once on construction, not once per
           // kernel
           Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+          LLVM_DEBUG(dbgs() << "amdgpu-lds-size after ModuleScopeStruct"
+                            << Offset << "\n");
         }
 
         if (AllocateKernelScopeStruct) {
@@ -1195,6 +1200,8 @@ class AMDGPULowerModuleLDS {
           Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
           recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
           Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
+          LLVM_DEBUG(dbgs() << "amdgpu-lds-size after KernelStruct: "
+                            << Offset << "\n");
         }
 
         // If there is dynamic allocation, the alignment needed is included in
@@ -1205,6 +1212,8 @@ class AMDGPULowerModuleLDS {
           GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
           Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
           recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
+          LLVM_DEBUG(dbgs() << "amdgpu-lds-size after DynamicVariable" << Offset
+                            << "\n");
         }
 
         if (Offset != 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table-O0.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table-O0.ll
new file mode 100644
index 0000000000000..fec5b47198917
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table-O0.ll
@@ -0,0 +1,92 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck --allow-empty --check-prefix=CHECK %s
+; CHECK-NOT: error: <unknown>:0:0: local memory (98304) exceeds limit (65536) in function 'k2'
+
+ at gA = internal addrspace(3) global [32768 x i8] undef, align 4
+ at gB = internal addrspace(3) global [32768 x i8] undef, align 4
+ at gC = internal addrspace(3) global [32768 x i8] undef, align 4
+
+; ---- Helpers ----
+
+define internal void @helperA() inlinehint {
+entry:
+  %p = getelementptr [32768 x i8], ptr addrspace(3) @gA, i32 0, i32 0
+  store i8 1, ptr addrspace(3) %p
+  ret void
+}
+
+define internal void @helperB() inlinehint {
+entry:
+  %p = getelementptr [32768 x i8], ptr addrspace(3) @gB, i32 0, i32 0
+  store i8 2, ptr addrspace(3) %p
+  ret void
+}
+
+define internal void @helperC() inlinehint {
+entry:
+  %p = getelementptr [32768 x i8], ptr addrspace(3) @gC, i32 0, i32 0
+  store i8 3, ptr addrspace(3) %p
+  ret void
+}
+
+; ---------------------------------------------------------------------------
+; Dispatch: takes an index and calls the appropriate helper.
+; If dispatch is NOT inlined (as at O0), a lowering pass that conservatively
+; examines call targets must assume all helpers (and thus all LDS globals)
+; are potentially referenced by every kernel that calls dispatch.
+; ---------------------------------------------------------------------------
+
+define void @dispatch(i32 %idx) inlinehint {
+entry:
+  %cmp1 = icmp eq i32 %idx, 1
+  br i1 %cmp1, label %case1, label %check2
+
+check2:
+  %cmp2 = icmp eq i32 %idx, 2
+  br i1 %cmp2, label %case2, label %check3
+
+check3:
+  %cmp3 = icmp eq i32 %idx, 3
+  br i1 %cmp3, label %case3, label %default
+
+case1:
+  call void @helperA()
+  br label %done
+
+case2:
+  call void @helperB()
+  br label %done
+
+case3:
+  call void @helperC()
+  br label %done
+
+default:
+  ; default case: call helperA to provide a default behaviour
+  call void @helperA()
+  br label %done
+
+done:
+  ret void
+}
+
+; ---- Kernels ----
+
+define amdgpu_kernel void @k0() {
+entry:
+  call void @dispatch(i32 1)
+  call void @dispatch(i32 2)
+  ret void
+}
+
+define amdgpu_kernel void @k1() {
+entry:
+  call void @dispatch(i32 2)
+  call void @dispatch(i32 1)
+  ret void
+}
+
+define amdgpu_kernel void @k2() {
+entry:
+  call void @helperC()
+  ret void
+}

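The table lowering can also be requested explicitly through the pass's
existing command-line strategy option, so an equivalent invocation at a
higher opt level would look roughly like the following (a sketch assuming
the existing --amdgpu-lower-module-lds-strategy option; not part of this
patch):

  ; RUN: llc -O1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
  ; RUN:   --amdgpu-lower-module-lds-strategy=table -filetype=null < %s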

