[llvm] cf74ef1 - [AMDGPU] Limit promote alloca max size in functions

Fri Sep 24 13:38:45 PDT 2021

Author: Stanislav Mekhanoshin
Date: 2021-09-24T13:38:39-07:00
New Revision: cf74ef134c9a8089d8997144d54628952c6d4552

URL: https://github.com/llvm/llvm-project/commit/cf74ef134c9a8089d8997144d54628952c6d4552
DIFF: https://github.com/llvm/llvm-project/commit/cf74ef134c9a8089d8997144d54628952c6d4552.diff

LOG: [AMDGPU] Limit promote alloca max size in functions

Non-entry functions have 32 caller saved VGPRs available. If we
promote alloca to consume more registers we will have to spill
CSRs. There is no reason to eliminate scratch access to get
another scratch access instead.

Differential Revision: https://reviews.llvm.org/D110372

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 737713ba8762..de1af4371140 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -176,6 +177,10 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F) {
   if (IsAMDGCN) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller preserved registers.
+    // Do not promote alloca which will force spilling.
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
   } else {
     MaxVGPRs = 128;
   }
@@ -1107,6 +1112,10 @@ bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller preserved registers.
+    // Do not promote alloca which will force spilling.
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
   } else {
     MaxVGPRs = 128;
   }

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 6235b9e3b16a..c1cf6cf1a031 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -131,6 +131,38 @@ entry:
   ret void
 }
 
+; OPT-LABEL: @alloca_9xi64_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @func_alloca_9xi64_max256(
+; OPT: alloca
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
 attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }