[llvm] 953c13b - [AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (#84735)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 19 03:49:25 PDT 2024
Author: Pierre van Houtryve
Date: 2024-03-19T11:49:22+01:00
New Revision: 953c13b5c90bed1e24fe95e90137c4e226ac2d09
URL: https://github.com/llvm/llvm-project/commit/953c13b5c90bed1e24fe95e90137c4e226ac2d09
DIFF: https://github.com/llvm/llvm-project/commit/953c13b5c90bed1e24fe95e90137c4e226ac2d09.diff
LOG: [AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (#84735)
Update PromoteAllocaToVector so it considers the whole function before promoting allocas.
Allocas are scored & sorted so the highest value ones are seen first. The budget is now per function instead of per alloca.
Passed internal performance testing.
Added:
llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e86132769f5788..6f3cdf54dceec7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));
+static cl::opt<unsigned>
+ LoopUserWeight("promote-alloca-vector-loop-user-weight",
+ cl::desc("The bonus weight of users of allocas within loop "
+ "when sorting profitable allocas"),
+ cl::init(4));
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
const TargetMachine &TM;
+ LoopInfo &LI;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
bool tryPromoteAllocaToVector(AllocaInst &I);
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+ void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+
public:
- AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+ AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
+
const Triple &TT = TM.getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ true);
return false;
}
@@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ false);
return false;
}
@@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
- "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();
}
+static void collectAllocaUses(AllocaInst &Alloca,
+ SmallVectorImpl<Use *> &Uses) {
+ SmallVector<Instruction *, 4> WorkList({&Alloca});
+ while (!WorkList.empty()) {
+ auto *Cur = WorkList.pop_back_val();
+ for (auto &U : Cur->uses()) {
+ Uses.push_back(&U);
+
+ if (isa<GetElementPtrInst>(U.getUser()))
+ WorkList.push_back(cast<Instruction>(U.getUser()));
+ }
+ }
+}
+
+void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
+ SmallVectorImpl<AllocaInst *> &Allocas) {
+ DenseMap<AllocaInst *, unsigned> Scores;
+
+ for (auto *Alloca : Allocas) {
+ LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
+ unsigned &Score = Scores[Alloca];
+ // Increment score by one for each user + a bonus for users within loops.
+ SmallVector<Use *, 8> Uses;
+ collectAllocaUses(*Alloca, Uses);
+ for (auto *U : Uses) {
+ Instruction *Inst = cast<Instruction>(U->getUser());
+ if (isa<GetElementPtrInst>(Inst))
+ continue;
+ unsigned UserScore =
+ 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+ LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
+ Score += UserScore;
+ }
+ LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
+ }
+
+ stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
+ return Scores.at(A) > Scores.at(B);
+ });
+
+ // clang-format off
+ LLVM_DEBUG(
+ dbgs() << "Sorted Worklist:\n";
+ for (auto *A: Allocas)
+ dbgs() << " " << *A << "\n";
+ );
+ // clang-format on
+}
+
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
+ // Use up to 1/4 of available register budget for vectorization.
+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+ unsigned VectorizationBudget =
+ (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32)) /
+ 4;
+
SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : F.getEntryBlock()) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
+ sortAllocasToPromote(Allocas);
+
bool Changed = false;
for (AllocaInst *AI : Allocas) {
- if (tryPromoteAllocaToVector(*AI))
+ const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
+ if (AllocaCost > VectorizationBudget) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
+ << "\n");
+ return false;
+ }
+
+ if (tryPromoteAllocaToVector(*AI)) {
Changed = true;
- else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+ assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+ "Underflow!");
+ VectorizationBudget -= AllocaCost;
+ LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
+ << VectorizationBudget << "\n");
+ if (VectorizationBudget == 0)
+ break;
+ } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}
@@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
ArrayTy->getNumElements());
}
- // Use up to 1/4 of available register budget for vectorization.
- unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
- : (MaxVGPRs * 32);
-
- if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
- LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
- << " registers available\n");
- return false;
- }
-
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
@@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
SmallVector<Instruction *> WorkList;
SmallVector<Instruction *> UsersToRemove;
SmallVector<Instruction *> DeferredInsts;
- SmallVector<Use *, 8> Uses;
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
@@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return false;
};
- for (Use &U : Alloca.uses())
- Uses.push_back(&U);
+ SmallVector<Use *, 8> Uses;
+ collectAllocaUses(Alloca, Uses);
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
Type *VecEltTy = VectorTy->getElementType();
unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
- while (!Uses.empty()) {
- Use *U = Uses.pop_back_val();
+ for (auto *U : Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());
if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
@@ -732,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "cannot compute vector index for GEP");
GEPVectorIdx[GEP] = Index;
- for (Use &U : Inst->uses())
- Uses.push_back(&U);
UsersToRemove.push_back(Inst);
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5007f77316f5b4..0ff5dd3680dfab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -195,13 +195,13 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
-; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
-; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
-; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
-; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 00000000000000..ab03177d1edc51
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT: => Final Score:1
+; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT: => Final Score:4
+; CHECK-NEXT: Sorted Worklist:
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+ ; should get a score of 1
+ %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+ ; should get a score of 4
+ %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+ store i32 42, ptr addrspace(5) %simpleuser
+
+ %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+ %v0 = load i8, ptr addrspace(5) %manyusers.1
+ %v0.ext = zext i8 %v0 to i32
+ store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+ %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+ %v1 = load i8, ptr addrspace(5) %manyusers.2
+ %v1.ext = zext i8 %v0 to i32
+ store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+ ret void
+}
+
+; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT: => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+ ; should get a score of 30 (each user scores 1, plus 4 per level of loop depth)
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+ %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+ store i32 42, ptr addrspace(5) %stack
+ br label %loop.outer
+
+loop.outer:
+ store i32 32, ptr addrspace(5) %stack
+ %outer.cmp = load i1, ptr addrspace(5) %stack.1
+ br label %loop.inner
+
+loop.inner:
+ store i32 32, ptr addrspace(5) %stack.1
+ %inner.cmp = load i1, ptr addrspace(5) %stack.2
+ br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+ store i32 64, ptr addrspace(5) %stack.2
+ ret void
+}
More information about the llvm-commits
mailing list