[llvm] fd1d608 - [AMDGPU] Remove CC exception for Promote Alloca Limits

Wed Apr 12 23:48:39 PDT 2023

Author: pvanhout
Date: 2023-04-13T08:48:34+02:00
New Revision: fd1d60873fdce6e908c9865ddf925f2616fccd55

URL: https://github.com/llvm/llvm-project/commit/fd1d60873fdce6e908c9865ddf925f2616fccd55
DIFF: https://github.com/llvm/llvm-project/commit/fd1d60873fdce6e908c9865ddf925f2616fccd55.diff

LOG: [AMDGPU] Remove CC exception for Promote Alloca Limits

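Apparently the calling-convention exception, which capped the VGPR budget at 32 for non-entry functions, was a workaround for an issue that has since been fixed.
Removing it helps with the high scratch usage observed in some cases, where alloca promotion failed because of that lower limit.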

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D145586

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2fe5fbebf7c19..27392aba20de1 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -124,6 +124,14 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
   }
 };
 
+unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
+  if (!TM.getTargetTriple().isAMDGCN())
+    return 128;
+
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  return ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+}
+
 } // end anonymous namespace
 
 char AMDGPUPromoteAlloca::ID = 0;
@@ -176,16 +184,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
-  if (IsAMDGCN) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
-    // A non-entry function has only 32 caller preserved registers.
-    // Do not promote alloca which will force spilling.
-    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
-      MaxVGPRs = std::min(MaxVGPRs, 32u);
-  } else {
-    MaxVGPRs = 128;
-  }
+  MaxVGPRs = getMaxVGPRs(TM, F);
 
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
@@ -1200,17 +1199,7 @@ bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
-  unsigned MaxVGPRs;
-  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
-    // A non-entry function has only 32 caller preserved registers.
-    // Do not promote alloca which will force spilling.
-    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
-      MaxVGPRs = std::min(MaxVGPRs, 32u);
-  } else {
-    MaxVGPRs = 128;
-  }
+  const unsigned MaxVGPRs = getMaxVGPRs(TM, F);
 
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 3afc22a5fac2e..dccf1c7021a37 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -139,8 +139,8 @@ entry:
 }
 
 ; OPT-LABEL: @func_alloca_9xi64_max256(
-; OPT: alloca
-; OPT-NOT: <9 x i64>
+; OPT-NOT: alloca
+; OPT: <9 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i64>
 define void @func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {