[llvm] bc78c09 - [amdgpu] Elide module lds allocation in kernels with no callees

Jon Chesterfield via llvm-commits llvm-commits at lists.llvm.org
Wed May 4 14:42:22 PDT 2022


Author: Jon Chesterfield
Date: 2022-05-04T22:42:07+01:00
New Revision: bc78c099524283b5de44517ee5fbb805d09a7cdc

URL: https://github.com/llvm/llvm-project/commit/bc78c099524283b5de44517ee5fbb805d09a7cdc
DIFF: https://github.com/llvm/llvm-project/commit/bc78c099524283b5de44517ee5fbb805d09a7cdc.diff

LOG: [amdgpu] Elide module lds allocation in kernels with no callees

Introduces a string attribute, amdgpu-elide-module-lds, to allow
eliding the module.lds block from kernels. The block is allocated as before
when the attribute is absent.

The patch uses the new attribute to handle the simplest possible instance of
this: a kernel that makes no calls cannot call any function that uses LDS, so
the pass marks it and the backend skips the allocation.
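
As a rough sketch of the resulting IR (simplified from the lower-module-lds.ll
test below; the struct layout and names here are illustrative only), the pass
now leaves kernels looking roughly like this:

  %llvm.amdgcn.module.lds.t = type { [1 x i8] }

  @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 1

  declare void @llvm.donothing()
  declare void @f0()

  ; Kernel with a callee: keeps the ExplicitUse marker, so the backend still
  ; allocates module.lds at offset 0.
  define amdgpu_kernel void @kern_call() {
    call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
    call void @f0()
    ret void
  }

  ; Kernel with no callees: annotated instead, so allocateModuleLDSGlobal can
  ; skip the llvm.amdgcn.module.lds allocation.
  define amdgpu_kernel void @kern_empty() #0 {
    ret void
  }

  attributes #0 = { "amdgpu-elide-module-lds" }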

Tests are updated to match; coverage was already good. The interesting case is
in lower-module-lds-offsets, where annotating the kernel allows the backend to
pick a different (in this case better) variable ordering than before. A later
patch will avoid moving kernel variables into module.lds when the kernel can
carry this attribute, allowing optimal ordering and elimination of locally
unused variables.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D122091

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index b64aa41855a49..f6eecf4e250e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -498,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F.getParent());
+  Info->allocateModuleLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -583,7 +583,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F.getParent());
+  Info->allocateModuleLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 75cfd124cd070..35922341de26e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -163,9 +164,10 @@ class AMDGPULowerModuleLDS : public ModulePass {
   }
 
   bool runOnModule(Module &M) override {
+    CallGraph CG = CallGraph(M);
     UsedList = getUsedList(M);
     bool Changed = superAlignLDSGlobals(M);
-    Changed |= processUsedLDS(M);
+    Changed |= processUsedLDS(CG, M);
 
     for (Function &F : M.functions()) {
       if (F.isDeclaration())
@@ -174,7 +176,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
       // Only lower compute kernels' LDS.
       if (!AMDGPU::isKernel(F.getCallingConv()))
         continue;
-      Changed |= processUsedLDS(M, &F);
+      Changed |= processUsedLDS(CG, M, &F);
     }
 
     UsedList.clear();
@@ -226,7 +228,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
     return Changed;
   }
 
-  bool processUsedLDS(Module &M, Function *F = nullptr) {
+  bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();
 
@@ -374,7 +376,20 @@ class AMDGPULowerModuleLDS : public ModulePass {
       IRBuilder<> Builder(Ctx);
       for (Function &Func : M.functions()) {
         if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
-          markUsedByKernel(Builder, &Func, SGV);
+          const CallGraphNode *N = CG[&Func];
+          const bool CalleesRequireModuleLDS = N->size() > 0;
+
+          if (CalleesRequireModuleLDS) {
+            // If a function this kernel might call requires module LDS,
+            // annotate the kernel to let later passes know it will allocate
+            // this structure, even if not apparent from the IR.
+            markUsedByKernel(Builder, &Func, SGV);
+          } else {
+            // However if we are certain this kernel cannot call a function that
+            // requires module LDS, annotate the kernel so the backend can elide
+            // the allocation without repeating callgraph walks.
+            Func.addFnAttr("amdgpu-elide-module-lds");
+          }
         }
       }
     }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 4fb44856f8979..b461c3c4bfdcd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -83,10 +83,16 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   return Offset;
 }
 
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+// This kernel calls no functions that require the module lds struct
+static bool canElideModuleLDS(const Function &F) {
+  return F.hasFnAttribute("amdgpu-elide-module-lds");
+}
+
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+  const Module *M = F.getParent();
   if (isModuleEntryFunction()) {
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
-    if (GV) {
+    if (GV && !canElideModuleLDS(F)) {
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
       (void)Offset;
       assert(Offset == 0 &&

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 3afa2c3e54776..6c0763804cf47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -102,7 +102,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
-  void allocateModuleLDSGlobal(const Module *M);
+  void allocateModuleLDSGlobal(const Function &F);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a8310c2a0c275..22f800118adfd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2447,7 +2447,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }
 
-  Info->allocateModuleLDSGlobal(Fn.getParent());
+  Info->allocateModuleLDSGlobal(Fn);
 
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
index bcb7a62c09398..69e14724d3d8c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -21,9 +21,8 @@
 ; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
 ; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
 ;.
-define amdgpu_kernel void @k0() {
+define amdgpu_kernel void @k0() #0 {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT:    %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
 ; CHECK-NEXT:    store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
 ; CHECK-NEXT:    %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
@@ -49,9 +48,8 @@ define amdgpu_kernel void @k0() {
   ret void
 }
 
-define amdgpu_kernel void @k1() {
+define amdgpu_kernel void @k1() #0 {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT:    %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
 ; CHECK-NEXT:    store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT:    %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
@@ -72,9 +70,8 @@ define amdgpu_kernel void @k1() {
   ret void
 }
 
-define amdgpu_kernel void @0() {
+define amdgpu_kernel void @0() #0 {
 ; CHECK-LABEL: @0(
-; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT:    %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT:    store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
 ; CHECK-NEXT:    ret void
@@ -85,9 +82,8 @@ define amdgpu_kernel void @0() {
   ret void
 }
 
-define amdgpu_kernel void @1() {
+define amdgpu_kernel void @1() #0 {
 ; CHECK-LABEL: @1(
-; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT:    %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT:    store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
 ; CHECK-NEXT:    ret void
@@ -114,6 +110,6 @@ define void @f0() {
 
   ret void
 }
-;.
-; CHECK: attributes #0 = { nocallback nofree nosync nounwind readnone willreturn }
-;.
+
+attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
index f7070a6adcf9f..37334b2ac81e4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -35,8 +35,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @timestwo()
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK-LABEL: @timestwo() #0
+; CHECK-NOT: call void @llvm.donothing()
 ; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
 ; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
 ; CHECK: %3 = ptrtoint i32* %2 to i64
@@ -56,3 +56,6 @@ define amdgpu_kernel void @timestwo() {
   store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   ret void
 }
+
+attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
index 92b26d36d4b8d..a1effa6e3eb43 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; Check that module LDS is allocated at address 0 and kernel starts its
-; allocation past module LDS.
+; allocation past module LDS when a call is present.
 
 @lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
@@ -22,12 +22,14 @@ define amdgpu_kernel void @k0() {
 ; OPT-NEXT:    store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
 ; OPT-NEXT:    [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; OPT-NEXT:    store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT:    call void @f0()
 ; OPT-NEXT:    ret void
 ;
   %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
   store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
   %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
   store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  call void @f0()
   ret void
 }
 
@@ -36,7 +38,7 @@ define amdgpu_kernel void @k0() {
 ; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
 ; GCN:     ds_write_b8 [[NULL]], [[TREE]]
 define void @f0() {
-; OPT-LABEL: @f0(
+; OPT-LABEL: @f0() {
 ; OPT-NEXT:    [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; OPT-NEXT:    store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
 ; OPT-NEXT:    ret void
@@ -45,3 +47,6 @@ define void @f0() {
   store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
   ret void
 }
+
+attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
index a620eb88a602c..8609ec370029b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -48,13 +48,16 @@ define amdgpu_kernel void @kern_call() {
   ret void
 }
 
-; This kernel does not need to alloc the LDS block as it makes no calls
+; This kernel does not alloc the LDS block as it makes no calls
 ; CHECK-LABEL: @kern_empty()
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-define spir_kernel void @kern_empty() {
+; CHECK-NOT: call void @llvm.donothing()
+define spir_kernel void @kern_empty() #0{
   ret void
 }
 
 ; Make sure we don't crash trying to insert code into a kernel
 ; declaration.
 declare amdgpu_kernel void @kernel_declaration()
+
+attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

More information about the llvm-commits mailing list