[llvm] [AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (PR #84735)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 11 02:22:06 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
Add a new mode to PromoteAllocaToVector so it considers the whole function before promoting allocas.
- Sort allocas so "high-value" ones are seen first.
- High value = many users; users inside loops are given extra weight.
- In this mode, the budget is per function instead of per alloca.
Passed internal performance testing.
---
Full diff: https://github.com/llvm/llvm-project/pull/84735.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+108-19)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+5-5)
- (added) llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll (+72)
- (modified) llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b1b15e9915aea3..582f06a3d755a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,21 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));
+static cl::opt<bool> PromoteAllocaWholeFn(
+ "promote-alloca-vector-whole-function",
+ cl::desc("Collect, analyze and sort all allocas within a function before promoting them to vector. When disabled, allocas are promoted one by one in isolation"),
+ cl::init(true));
+static cl::opt<unsigned>
+ LoopUserWeight("promote-alloca-vector-loop-user-weight",
+ cl::desc("The bonus weight of users of allocas within loop "
+ "when sorting profitable alloca for whole-function alloca promotion to vector"),
+ cl::init(4));
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
const TargetMachine &TM;
+ LoopInfo &LI;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -101,8 +114,11 @@ class AMDGPUPromoteAllocaImpl {
bool tryPromoteAllocaToVector(AllocaInst &I);
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+ void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+
public:
- AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+ AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
+
const Triple &TT = TM.getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +138,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ true);
return false;
}
@@ -131,6 +149,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -145,7 +164,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ false);
return false;
}
@@ -156,6 +177,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -186,18 +208,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
- "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS*/ true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -208,7 +235,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS*/ false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -225,6 +253,49 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();
}
+void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
+ SmallVectorImpl<AllocaInst *> &Allocas) {
+ DenseMap<AllocaInst *, unsigned> Scores;
+
+ LLVM_DEBUG(dbgs() << "Before sorting allocas:\n"; for (auto *A
+ : Allocas) dbgs()
+ << " " << *A << "\n";);
+
+ for (auto *Alloca : Allocas) {
+ LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
+ unsigned &Score = Scores[Alloca];
+ // Increment score by one for each user + a bonus for users within loops.
+ //
+ // Look through GEPs and bitcasts for additional users.
+ SmallVector<User *, 8> WorkList;
+ WorkList.append(Alloca->user_begin(), Alloca->user_end());
+ while (!WorkList.empty()) {
+ auto *Inst = dyn_cast<Instruction>(WorkList.pop_back_val());
+ if (!Inst)
+ continue;
+
+ if (isa<BitCastInst>(Inst) || isa<GetElementPtrInst>(Inst)) {
+ WorkList.append(Inst->user_begin(), Inst->user_end());
+ continue;
+ }
+
+ unsigned UserScore =
+ 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+ LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
+ Score += UserScore;
+ }
+ LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
+ }
+
+ sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
+ return Scores.at(A) > Scores.at(B);
+ });
+
+ LLVM_DEBUG(dbgs() << "After sorting allocas:\n"; for (auto *A
+ : Allocas) dbgs()
+ << " " << *A << "\n";);
+}
+
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -237,6 +308,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
+ // Use up to 1/4 of available register budget for vectorization.
+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+ unsigned VectorizationBudget =
+ (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32)) /
+ 4;
+
SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : F.getEntryBlock()) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +326,32 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
+ if (PromoteAllocaWholeFn)
+ sortAllocasToPromote(Allocas);
+
bool Changed = false;
for (AllocaInst *AI : Allocas) {
- if (tryPromoteAllocaToVector(*AI))
+ const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
+ if (AllocaCost > VectorizationBudget) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
+ << "\n");
+ return false;
+ }
+
+ if (tryPromoteAllocaToVector(*AI)) {
Changed = true;
- else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+ if (PromoteAllocaWholeFn) {
+ // When PromoteAllocaWholeFn is enabled, the budget applies for the
+ // whole function. Otherwise it's per alloca.
+ assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+ "Underflow!");
+ VectorizationBudget -= AllocaCost;
+ LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
+ << VectorizationBudget << "\n");
+ if (VectorizationBudget == 0)
+ break;
+ }
+ } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}
@@ -641,16 +740,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
ArrayTy->getNumElements());
}
- // Use up to 1/4 of available register budget for vectorization.
- unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
- : (MaxVGPRs * 32);
-
- if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
- LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
- << " registers available\n");
- return false;
- }
-
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5007f77316f5b4..0ff5dd3680dfab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -195,13 +195,13 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
-; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
-; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
-; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
-; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 00000000000000..49778b22ce4b8a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -promote-alloca-vector-whole-function -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: Before sorting allocas:
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT: => Final Score:1
+; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT: => Final Score:4
+; CHECK-NEXT: After sorting allocas:
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+ ; should get a score of 1
+ %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+ ; should get a score of 4
+ %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+ store i32 42, ptr addrspace(5) %simpleuser
+
+ %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+ %v0 = load i8, ptr addrspace(5) %manyusers.1
+ %v0.ext = zext i8 %v0 to i32
+ store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+ %manyusers.2 = bitcast ptr addrspace(5) %manyusers to ptr addrspace(5)
+ %v1 = load i8, ptr addrspace(5) %manyusers.2
+ %v1.ext = zext i8 %v0 to i32
+ store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+ ret void
+}
+
+; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+ ; should get a score of 1
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+ %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+ store i32 42, ptr addrspace(5) %stack
+ br label %loop.outer
+
+loop.outer:
+ store i32 32, ptr addrspace(5) %stack
+ %outer.cmp = load i1, ptr addrspace(5) %stack.1
+ br label %loop.inner
+
+loop.inner:
+ store i32 32, ptr addrspace(5) %stack.1
+ %inner.cmp = load i1, ptr addrspace(5) %stack.2
+ br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+ store i32 64, ptr addrspace(5) %stack.2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 64d7c7868ca8de..c7797d492ca16c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -promote-alloca-vector-whole-function=0 < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -promote-alloca-vector-whole-function=0 -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
target datalayout = "A5"
``````````
</details>
https://github.com/llvm/llvm-project/pull/84735
More information about the llvm-commits
mailing list