[llvm] dd1df09 - [InlineCost][TargetTransformInfo][AMDGPU] Consider cost of alloca instructions in the caller (2/2)

Juan Manuel MARTINEZ CAAMAÑO via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 29 00:51:49 PDT 2023


Author: Juan Manuel MARTINEZ CAAMAÑO
Date: 2023-06-29T09:49:16+02:00
New Revision: dd1df099ae37c2a68a20c96615b7ed7474eacec4

URL: https://github.com/llvm/llvm-project/commit/dd1df099ae37c2a68a20c96615b7ed7474eacec4
DIFF: https://github.com/llvm/llvm-project/commit/dd1df099ae37c2a68a20c96615b7ed7474eacec4.diff

LOG: [InlineCost][TargetTransformInfo][AMDGPU] Consider cost of alloca instructions in the caller (2/2)

Before this patch, the compiler gave a bump to the inline-threshold
when the total size of the allocas passed as arguments to the
callee was below 256 bytes.
This heuristic ignored that some of these allocas could have been
removed by SROA if inlining was applied.
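
As a minimal sketch of the case the old heuristic missed (hypothetical
IR, mirroring the [65 x float] case in the test added below; the
function names are made up):

  target datalayout = "A5"   ; allocas live in addrspace(5)

  define void @callee(ptr addrspace(5) %p) {
    %v = load float, ptr addrspace(5) %p, align 4   ; only simple local
    store float %v, ptr addrspace(5) %p, align 4    ; accesses, so SROA
    ret void                                        ; can promote %p
  }

  define amdgpu_kernel void @caller() {
    ; 260 bytes of private memory, above the old 256-byte cutoff: the
    ; old code dropped the bonus entirely, even though inlining would
    ; let SROA promote %arr and eliminate the scratch usage.
    %arr = alloca [65 x float], align 4, addrspace(5)
    call void @callee(ptr addrspace(5) %arr)
    ret void
  }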

Ideally, this bonus would be attributed to the threshold once the
size of all the allocas that could not be handled by SROA is known:
at the end of the InlineCost analysis.
However, we may never reach this point if the inline-cost analysis exits
early when the inline cost goes over the threshold mid-analysis.

This patch proposes to:
* Attribute the bonus to the inline-threshold whenever allocas are
  passed as arguments (regardless of their total size).
* Assign a cost to each alloca proportional to its size, such that
  the costs of all the allocas together cancel the bonus (see the
  worked example below).
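
A worked example, using the default values in this patch and the
numbers visible in the updated tests (ArgAllocaCost, set by
-amdgpu-inline-arg-alloca-cost, defaults to 4000; the AMDGPU threshold
multiplier is 11; a single-BB callee gets a further 50% bonus):

  bonus = 4000 * 11       = 44000
        + 44000 / 2       = 66000   (the "Threshold: 66000" in the tests)

  Two allocas totalling 260 bytes, above the 256-byte cutoff:
  cost([33 x float], 132 bytes) = 66000 * 132 / 260 = 33507
  cost([32 x float], 128 bytes) = 66000 * 128 / 260 = 32492

The two costs sum to 65999, cancelling the bonus up to rounding; these
are exactly the SROACostSavingsLost values in the tests below.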

Potential problems:
* This patch assumes that removing alloca instructions with SROA is
  always profitable. This may not be the case if the total size of the
  allocas is still too big to be promoted to registers/LDS.
* Redundant calls to getCallArgsTotalAllocaSize
* Awkwardly, the attributed threshold bonus also contributes to the
  single-bb and vector bonuses.

Reviewed By: scchan

Differential Revision: https://reviews.llvm.org/D149741

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
    llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
    llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index bec4d30fb9f03f..1fa0a025c6226d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1219,36 +1219,83 @@ static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
   return adjustThreshold;
 }
 
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
-  // If we have a pointer to private array passed into a function
+static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
+                                           const DataLayout &DL) {
+  // If we have a pointer to a private array passed into a function
   // it will not be optimized out, leaving scratch usage.
-  // Increase the inline threshold to allow inlining in this case.
-  unsigned adjustThreshold = 0;
-  uint64_t AllocaSize = 0;
+  // This function calculates the total size in bytes of the memory that would
+  // end in scratch if the call was not inlined.
+  unsigned AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
     PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
-    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
-                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+    if (!Ty)
       continue;
 
-    PtrArg = getUnderlyingObject(PtrArg);
-    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
-      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
-        continue;
-      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
-    }
+    unsigned AddrSpace = Ty->getAddressSpace();
+    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
+        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
+      continue;
+
+    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
+    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+      continue;
+
+    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
   }
-  adjustThreshold +=
-      adjustInliningThresholdUsingCallee(CB, TLI, this);
-  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
-  return adjustThreshold;
+  return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+
+  // Private objects passed as arguments may end up in scratch usage if the
+  // call is not inlined. Increase the inline threshold to promote inlining.
+  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
+  if (AllocaSize > 0)
+    Threshold += ArgAllocaCost;
+  return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+                                         const AllocaInst *AI) const {
+
+  // Below the cutoff, assume that the private memory objects would be
+  // optimized out and attribute no cost to the allocas.
+  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
+  if (AllocaSize <= ArgAllocaCutoff)
+    return 0;
+
+  // Above the cutoff, we give a cost to each private memory object
+  // depending on its size. If the array can be optimized by SROA this cost
+  // is not added to the total-cost in the inliner cost analysis.
+  //
+  // We choose the cost of each alloca such that their sum cancels the
+  // bonus given in the threshold (ArgAllocaCost).
+  //
+  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
+  //
+  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
+  // the single-bb bonus and the vector-bonus.
+  //
+  // We compensate for the first two multipliers by repeating the
+  // inliner-cost logic here. The vector-bonus is 0 on AMDGPU.
+  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
+  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
+
+  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
+    return BB.getTerminator()->getNumSuccessors() > 1;
+  });
+  if (SingleBB) {
+    Threshold += Threshold / 2;
+  }
+
+  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+
+  // Attribute the bonus proportionally to the alloca size
+  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
+
+  return AllocaThresholdBonus;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index db223e1272a239..7bbf414b2de16d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -71,6 +71,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   bool IsGraphics;
   bool HasFP32Denormals;
   bool HasFP64FP16Denormals;
+  static constexpr int InlinerVectorBonusPercent = 0;
 
   static const FeatureBitset InlineFeatureIgnoreList;
 
@@ -240,8 +241,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   unsigned getInliningThresholdMultiplier() const { return 11; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
 
-  int getInlinerVectorBonusPercent() const { return 0; }
+  int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
 
   InstructionCost getArithmeticReductionCost(
       unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
index 9bd0e3907bdf66..b34df3ffca2642 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -30,6 +30,8 @@ if.end:                                           ; preds = %if.then, %entry
 
 define coldcc void @foo_private_ptr2(ptr addrspace(5) nocapture %p1, ptr addrspace(5) nocapture %p2) {
 entry:
+  call void @forbid_sroa(ptr addrspace(5) %p1)
+  call void @forbid_sroa(ptr addrspace(5) %p2)
   %tmp1 = load float, ptr addrspace(5) %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end
@@ -171,6 +173,7 @@ bb.2:
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @_Z3sinf(float) #1
+declare void @forbid_sroa(ptr addrspace(5) nocapture %p)
 
 attributes #0 = { noinline }
 attributes #1 = { nounwind readnone }

diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
index a11861c11342b9..fbaf64fce5f44c 100644
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
@@ -6,16 +6,91 @@ target datalayout = "A5"
 
 ; Verify we are properly adding cost of the -amdgpu-inline-arg-alloca-cost to the threshold.
 
+define void @local_access_only(ptr addrspace(5) %p, i32 %idx) {
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p, i32 0, i32 %idx
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+; Below the cutoff, the alloca cost is 0, and only the cost of the instructions saved by SROA is counted
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_below_cutoff)
+; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 10
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_below_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+; Above the cutoff, attribute a cost to the alloca
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_above_cutoff)
 ; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 66010
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_above_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [65 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+define void @use_first_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p2, i32 0, i32 7
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+define void @use_both_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  call void @external(ptr addrspace(5) %p2)
+  ret void
+}
+
+; One array cannot be handled by SROA
+; CHECK: Analyzing call of use_first_externally... (caller:test_inliner_sroa_double)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 32502
+; CHECK: SROACostSavingsLost: 33507
 ; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_double() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_first_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
+  ret void
+}
 
-define void @use_private_ptr_arg(ptr addrspace(5) nocapture %p) {
+; The two arrays cannot be handled by SROA
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_sroa)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 65999
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_no_sroa() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_both_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
   ret void
 }
 
-define amdgpu_kernel void @test_inliner_pvt_ptr(ptr addrspace(1) nocapture %a, i32 %n) {
+; No private arrays
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_alloc)
+; CHECK: NumAllocaArgs: 0
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 0
+define amdgpu_kernel void @test_inliner_no_alloc(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
-  call void @use_private_ptr_arg(ptr addrspace(5) %pvt_arr)
+  call void @use_both_externally(ptr addrspace(5) %a, ptr addrspace(5) %b)
   ret void
 }
+
+declare void @external(ptr addrspace(5) %p)


        

