[llvm] 21d2884 - AMDGPU: Annotate functions that have stack objects

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue May 19 15:51:11 PDT 2020


Author: Matt Arsenault
Date: 2020-05-19T18:51:00-04:00
New Revision: 21d2884a9c5b4227a3e7f3220e4398aab1e49e1c

URL: https://github.com/llvm/llvm-project/commit/21d2884a9c5b4227a3e7f3220e4398aab1e49e1c
DIFF: https://github.com/llvm/llvm-project/commit/21d2884a9c5b4227a3e7f3220e4398aab1e49e1c.diff

LOG: AMDGPU: Annotate functions that have stack objects

Relying on any MachineFunction state in the MachineFunctionInfo
constructor is hazardous, because the construction time is unclear and
determined by the first use. The function may be only partially
constructed, which is part of why we have many of these hacky string
attributes to track what we need for ABI lowering.
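
A rough sketch of the pattern in question (the ExampleMFI type and its
HasStackObjects field are hypothetical, not the in-tree
SIMachineFunctionInfo): the constructor reads an attribute computed at
the IR level instead of MachineFrameInfo state that may not exist yet.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"

using namespace llvm;

struct ExampleMFI {
  bool HasStackObjects;

  explicit ExampleMFI(const MachineFunction &MF)
      // Hazardous variant: the target MachineFunctionInfo is constructed
      // lazily on first use, so MF.getFrameInfo().hasStackObjects() may be
      // queried before the frame objects have been created.
      //
      // Safer variant: read a string attribute that an IR pass set before
      // instruction selection started building the MachineFunction.
      : HasStackObjects(
            MF.getFunction().hasFnAttribute("amdgpu-stack-objects")) {}
};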

For SelectionDAG, all stack objects are created up-front before
calling convention lowering, so they are visible at construction
time. For GlobalISel, none of the IR function has been visited yet,
so the allocas have not been added to the MachineFrameInfo. This
should fix GlobalISel failing to set flat_scratch_init when it is
needed.
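
A minimal sketch of the IR-level alloca detection, roughly mirroring
the AMDGPUAnnotateKernelFeatures change in the diff below (the
free-standing helper markStackObjects is made up for illustration; the
real pass folds this into addFeatureAttributes):

#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Adds the "amdgpu-stack-objects" attribute if the function contains any
// alloca; returns whether the IR was changed.
static bool markStackObjects(Function &F) {
  for (Instruction &I : instructions(F)) {
    if (isa<AllocaInst>(I)) {
      F.addFnAttr("amdgpu-stack-objects");
      return true;
    }
  }
  return false;
}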

This pass really needs to be turned into some kind of analysis, but I
haven't found a nice way to use one here.

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index d241b4899b43..625074569cfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -279,6 +279,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
   bool HasApertureRegs = ST.hasApertureRegs();
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
+  bool HaveStackObjects = false;
   bool Changed = false;
   bool NeedQueuePtr = false;
   bool HaveCall = false;
@@ -286,6 +287,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
 
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
+      if (isa<AllocaInst>(I)) {
+        HaveStackObjects = true;
+        continue;
+      }
+
       if (auto *CB = dyn_cast<CallBase>(&I)) {
         const Function *Callee =
             dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
@@ -355,6 +361,11 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
     Changed = true;
   }
 
+  if (HaveStackObjects) {
+    F.addFnAttr("amdgpu-stack-objects");
+    Changed = true;
+  }
+
   return Changed;
 }
 

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 06681471bf90..2a3ba523f8c2 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -55,11 +55,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
   Occupancy = ST.computeOccupancy(MF, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
   // FIXME: Should have analysis or something rather than attribute to detect
   // calls.
-  const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls");
+  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
 
   // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
   // have any calls.
@@ -125,8 +124,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
       WorkItemIDZ = true;
   }
 
-  bool HasStackObjects = FrameInfo.hasStackObjects();
-
+  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
   if (isEntryFunction()) {
     // X, XY, and XYZ are the only supported combinations, so make sure Y is
     // enabled if Z is.
@@ -170,20 +168,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     KernargSegmentPtr = true;
 
   if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
-    auto hasNonSpillStackObjects = [&]() {
-      // Avoid expensive checking if there's no stack objects.
-      if (!HasStackObjects)
-        return false;
-      for (auto OI = FrameInfo.getObjectIndexBegin(),
-                OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
-        if (!FrameInfo.isSpillSlotObjectIndex(OI))
-          return true;
-      // All stack objects are spill slots.
-      return false;
-    };
     // TODO: This could be refined a lot. The attribute is a poor way of
-    // detecting calls that may require it before argument lowering.
-    if (HasCalls || hasNonSpillStackObjects())
+    // detecting calls or stack objects that may require it before argument
+    // lowering.
+    if (HasCalls || HasStackObjects)
       FlatScratchInit = true;
   }
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
new file mode 100644
index 000000000000..39029e359889
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure flat_scratch_init is set
+
+; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
+  %alloca = alloca i32, addrspace(5)
+  %cast = addrspacecast i32 addrspace(5)* %alloca to i32*
+  store volatile i32 0, i32* %cast
+  ret void
+}
+
+; TODO: Could optimize out in this case
+; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
+  %alloca = alloca i32, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
+define amdgpu_kernel void @kernel_no_calls_no_stack() {
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 7efd007195e0..3e24c17834a5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workgroup.id.y() #0
 declare i32 @llvm.amdgcn.workgroup.id.z() #0
@@ -250,6 +252,31 @@ define amdgpu_kernel void @use_is_private(i8* %ptr) #1 {
   ret void
 }
 
+; HSA: define amdgpu_kernel void @use_alloca() #13 {
+define amdgpu_kernel void @use_alloca() #1 {
+  %alloca = alloca i32, addrspace(5)
+  store i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @use_alloca_non_entry_block() #13 {
+define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
+entry:
+  br label %bb
+
+bb:
+  %alloca = alloca i32, addrspace(5)
+  store i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+; HSA: define void @use_alloca_func() #13 {
+define void @use_alloca_func() #1 {
+  %alloca = alloca i32, addrspace(5)
+  store i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind }
 
@@ -266,3 +293,4 @@ attributes #1 = { nounwind }
 ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
 ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
 ; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
+; HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }


        

