[PATCH] D110372: [AMDGPU] Limit promote alloca max size in functions

Thu Sep 23 16:10:31 PDT 2021

rampitec created this revision.
rampitec added a reviewer: arsenm.
Herald added subscribers: foad, kerbowa, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl.
rampitec requested review of this revision.
Herald added a subscriber: wdng.
Herald added a project: LLVM.

Non-entry functions have 32 caller saved VGPRs available. If we
promote alloca to consume more registers we will have to spill
CSRs. There is no reason to eliminate scratch access to get
another scratch access instead.


https://reviews.llvm.org/D110372

Files:
  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
  llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll


Index: llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -131,6 +131,38 @@
   ret void
 }
 
+; OPT-LABEL: @alloca_9xi64_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @func_alloca_9xi64_max256(
+; OPT: alloca
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
 attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "Utils/AMDGPULDSUtils.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -176,6 +177,10 @@
   if (IsAMDGCN) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller preserved registers.
+    // Do not promote alloca which will force spilling.
+    if (!AMDGPU::isKernelCC(&F))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
   } else {
     MaxVGPRs = 128;
   }
@@ -1107,6 +1112,10 @@
   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+    // A non-entry function has only 32 caller preserved registers.
+    // Do not promote alloca which will force spilling.
+    if (!AMDGPU::isKernelCC(&F))
+      MaxVGPRs = std::min(MaxVGPRs, 32u);
   } else {
     MaxVGPRs = 128;
   }


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D110372.374688.patch
Type: text/x-patch
Size: 2840 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210923/14191cfa/attachment-0001.bin>