[llvm] 54e2dc7 - [AMDGPU] Limit promote alloca to vector with VGPR budget

Wed Jul 1 15:57:33 PDT 2020

Author: Stanislav Mekhanoshin
Date: 2020-07-01T15:57:24-07:00
New Revision: 54e2dc7537dd62a70d76883197e3007cadf060aa

URL: https://github.com/llvm/llvm-project/commit/54e2dc7537dd62a70d76883197e3007cadf060aa
DIFF: https://github.com/llvm/llvm-project/commit/54e2dc7537dd62a70d76883197e3007cadf060aa.diff

LOG: [AMDGPU] Limit promote alloca to vector with VGPR budget

Limit promotion of an alloca to a vector so that any single alloca may
use at most 1/4 of the available VGPR budget.

Differential Revision: https://reviews.llvm.org/D82990

Added: 
    llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f719403cb8e5..727f71b35049 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
   cl::desc("Disable promote alloca to LDS"),
   cl::init(false));
 
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+  "amdgpu-promote-alloca-to-vector-limit",
+  cl::desc("Maximum byte size to consider promote alloca to vector"),
+  cl::init(0));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@@ -86,6 +91,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
+  unsigned MaxVGPRs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -129,6 +135,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 };
 
 class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+  unsigned MaxVGPRs;
+
 public:
   static char ID;
 
@@ -186,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
+  if (IsAMDGCN) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
@@ -409,7 +425,8 @@ static bool canVectorizeInst(Instruction *Inst, User *User,
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+                                     unsigned MaxVGPRs) {
 
   if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
@@ -424,6 +441,16 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
       VectorTy = arrayTypeToVecType(ArrayTy);
   }
 
+  // Use up to 1/4 of available register budget for vectorization.
+  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                              : (MaxVGPRs * 32);
+
+  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
+                      << MaxVGPRs << " registers available\n");
+    return false;
+  }
+
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
   // FIXME: There is no reason why we can't support larger arrays, we
@@ -806,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, DL))
+  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
     return true; // Promoted to vector.
 
   if (DisablePromoteAllocaToLDS)
@@ -1016,6 +1043,23 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
   if (skipFunction(F) || DisablePromoteAllocaToVector)
     return false;
 
+  const TargetMachine *TM;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
 
@@ -1042,7 +1086,7 @@ bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
   Module *Mod = I.getParent()->getParent()->getParent();
-  return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
new file mode 100644
index 000000000000..6235b9e3b16a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -0,0 +1,136 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+
+target datalayout = "A5"
+
+; OPT-LABEL: @alloca_8xi64_max1024(
+; OPT-NOT: alloca
+; OPT: <8 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <8 x i64>
+define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [8 x i64], addrspace(5)
+  %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi64_max1024(
+; OPT: alloca [9 x i64]
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi64_max512(
+; OPT-NOT: alloca
+; OPT: <16 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i64>
+define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [16 x i64], addrspace(5)
+  %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_17xi64_max512(
+; OPT: alloca [17 x i64]
+; OPT-NOT: <17 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <17 x i64>
+define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [17 x i64], addrspace(5)
+  %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max512(
+; OPT: alloca [9 x i128]
+; OPT-NOT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi128_max256(
+; OPT-NOT: alloca
+; OPT: <16 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i128>
+define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [16 x i128], addrspace(5)
+  %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi256_max256(
+; OPT: alloca [9 x i256]
+; OPT-NOT: <9 x i256>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i256>
+define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i256], addrspace(5)
+  %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
+  store i256 0, i256 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i256, i256 addrspace(5)* %tmp1
+  store i256 %tmp2, i256 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }