[llvm] 54e2dc7 - [AMDGPU] Limit promote alloca to vector with VGPR budget
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 1 15:57:33 PDT 2020
Author: Stanislav Mekhanoshin
Date: 2020-07-01T15:57:24-07:00
New Revision: 54e2dc7537dd62a70d76883197e3007cadf060aa
URL: https://github.com/llvm/llvm-project/commit/54e2dc7537dd62a70d76883197e3007cadf060aa
DIFF: https://github.com/llvm/llvm-project/commit/54e2dc7537dd62a70d76883197e3007cadf060aa.diff
LOG: [AMDGPU] Limit promote alloca to vector with VGPR budget
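Allow only up to 1/4 of the available VGPRs to be used for the
vectorization of any given alloca. The VGPR-derived budget can be replaced
with an explicit value via the new -amdgpu-promote-alloca-to-vector-limit
option; the same 1/4 factor is applied to that value.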
Differential Revision: https://reviews.llvm.org/D82990
Added:
llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f719403cb8e5..727f71b35049 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
cl::desc("Disable promote alloca to LDS"),
cl::init(false));
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+ "amdgpu-promote-alloca-to-vector-limit",
+ cl::desc("Maximum byte size to consider promote alloca to vector"),
+ cl::init(0));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -86,6 +91,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;
+ unsigned MaxVGPRs;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -129,6 +135,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
};
class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+ unsigned MaxVGPRs;
+
public:
static char ID;
@@ -186,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
if (!ST.isPromoteAllocaEnabled())
return false;
+ if (IsAMDGCN) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@@ -409,7 +425,8 @@ static bool canVectorizeInst(Instruction *Inst, User *User,
}
}
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+ unsigned MaxVGPRs) {
if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
@@ -424,6 +441,16 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
VectorTy = arrayTypeToVecType(ArrayTy);
}
+ // Use up to 1/4 of available register budget for vectorization.
+ unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32);
+
+ if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
+ << MaxVGPRs << " registers available\n");
+ return false;
+ }
+
LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
@@ -806,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I, DL))
+ if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
return true; // Promoted to vector.
if (DisablePromoteAllocaToLDS)
@@ -1016,6 +1043,23 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
if (skipFunction(F) || DisablePromoteAllocaToVector)
return false;
+ const TargetMachine *TM;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
+ return false;
+
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@@ -1042,7 +1086,7 @@ bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
Module *Mod = I.getParent()->getParent()->getParent();
- return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+ return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
new file mode 100644
index 000000000000..6235b9e3b16a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -0,0 +1,136 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+
+target datalayout = "A5"
+
+; OPT-LABEL: @alloca_8xi64_max1024(
+; OPT-NOT: alloca
+; OPT: <8 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <8 x i64>
+define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %tmp = alloca [8 x i64], addrspace(5)
+ %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
+ store i64 0, i64 addrspace(5)* %x
+ %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i64, i64 addrspace(5)* %tmp1
+ store i64 %tmp2, i64 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_9xi64_max1024(
+; OPT: alloca [9 x i64]
+; OPT-NOT: <9 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %tmp = alloca [9 x i64], addrspace(5)
+ %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+ store i64 0, i64 addrspace(5)* %x
+ %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i64, i64 addrspace(5)* %tmp1
+ store i64 %tmp2, i64 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_16xi64_max512(
+; OPT-NOT: alloca
+; OPT: <16 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i64>
+define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+ %tmp = alloca [16 x i64], addrspace(5)
+ %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
+ store i64 0, i64 addrspace(5)* %x
+ %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i64, i64 addrspace(5)* %tmp1
+ store i64 %tmp2, i64 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_17xi64_max512(
+; OPT: alloca [17 x i64]
+; OPT-NOT: <17 x i64>
+; LIMIT32: alloca
+; LIMIT32-NOT: <17 x i64>
+define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+ %tmp = alloca [17 x i64], addrspace(5)
+ %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
+ store i64 0, i64 addrspace(5)* %x
+ %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i64, i64 addrspace(5)* %tmp1
+ store i64 %tmp2, i64 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max512(
+; OPT: alloca [9 x i128]
+; OPT-NOT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+entry:
+ %tmp = alloca [9 x i128], addrspace(5)
+ %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+ store i128 0, i128 addrspace(5)* %x
+ %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i128, i128 addrspace(5)* %tmp1
+ store i128 %tmp2, i128 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+ %tmp = alloca [9 x i128], addrspace(5)
+ %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+ store i128 0, i128 addrspace(5)* %x
+ %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i128, i128 addrspace(5)* %tmp1
+ store i128 %tmp2, i128 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_16xi128_max256(
+; OPT-NOT: alloca
+; OPT: <16 x i128>
+; LIMIT32: alloca
+; LIMIT32-NOT: <16 x i128>
+define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+ %tmp = alloca [16 x i128], addrspace(5)
+ %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
+ store i128 0, i128 addrspace(5)* %x
+ %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i128, i128 addrspace(5)* %tmp1
+ store i128 %tmp2, i128 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @alloca_9xi256_max256(
+; OPT: alloca [9 x i256]
+; OPT-NOT: <9 x i256>
+; LIMIT32: alloca
+; LIMIT32-NOT: <9 x i256>
+define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+entry:
+ %tmp = alloca [9 x i256], addrspace(5)
+ %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
+ store i256 0, i256 addrspace(5)* %x
+ %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
+ %tmp2 = load i256, i256 addrspace(5)* %tmp1
+ store i256 %tmp2, i256 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
More information about the llvm-commits
mailing list