[llvm] [AMDGPU] Refine AMDGPUAtomicOptimizerImpl class. NFC. (PR #118302)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 2 06:34:12 PST 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/118302
Use references instead of pointers for most state and common up some of
the initialization between the legacy and new pass manager paths.
>From 2e291c65f6229019b936ef3270cdc0d703d19f54 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Mon, 2 Dec 2024 14:23:51 +0000
Subject: [PATCH] [AMDGPU] Refine AMDGPUAtomicOptimizerImpl class. NFC.
Use references instead of pointers for most state and common up some of
the initialization between the legacy and new pass manager paths.
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 86 +++++++++----------
1 file changed, 39 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4ca1ae0499b93..c09c71c83fead7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass {
class AMDGPUAtomicOptimizerImpl
: public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
+ Function &F;
SmallVector<ReplacementInfo, 8> ToReplace;
- const UniformityInfo *UA;
- const DataLayout *DL;
+ const UniformityInfo &UA;
+ const DataLayout &DL;
DomTreeUpdater &DTU;
- const GCNSubtarget *ST;
+ const GCNSubtarget &ST;
bool IsPixelShader;
ScanOptions ScanImpl;
@@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl
public:
AMDGPUAtomicOptimizerImpl() = delete;
- AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
- DomTreeUpdater &DTU, const GCNSubtarget *ST,
- bool IsPixelShader, ScanOptions ScanImpl)
- : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+ AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
+ DomTreeUpdater &DTU, const GCNSubtarget &ST,
+ ScanOptions ScanImpl)
+ : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
+ IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
ScanImpl(ScanImpl) {}
- bool run(Function &F);
+ bool run();
void visitAtomicRMWInst(AtomicRMWInst &I);
void visitIntrinsicInst(IntrinsicInst &I);
@@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return false;
}
- const UniformityInfo *UA =
- &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- const DataLayout *DL = &F.getDataLayout();
+ const UniformityInfo &UA =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- DominatorTreeWrapperPass *const DTW =
+ DominatorTreeWrapperPass *DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
DomTreeUpdater::UpdateStrategy::Lazy);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
- return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}
PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
FunctionAnalysisManager &AM) {
-
- const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
- const DataLayout *DL = &F.getDataLayout();
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
DomTreeUpdater::UpdateStrategy::Lazy);
- const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
-
- bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- bool IsChanged =
- AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
- .run(F);
+ bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
if (!IsChanged) {
return PreservedAnalyses::all();
@@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
return PA;
}
-bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+bool AMDGPUAtomicOptimizerImpl::run() {
// Scan option None disables the Pass
if (ScanImpl == ScanOptions::None) {
@@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
+ if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
return;
}
- bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;
if (!isLegalCrossLaneType(I.getType()))
@@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
const unsigned ValIdx = 0;
- const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
+ const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget (for DPP strategy), and the atomic
// operation is 32 or 64 bits.
if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+ if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
return;
if (!isLegalCrossLaneType(I.getType()))
@@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (UA->isDivergentUse(I.getOperandUse(Idx))) {
+ if (UA.isDivergentUse(I.getOperandUse(Idx))) {
return;
}
}
@@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
}
// Reduce within each pair of rows (i.e. 32 lanes).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *Permlanex16Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(0),
B.getInt32(0), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
return V;
}
- if (ST->hasPermLane64()) {
+ if (ST.hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
Value *Permlane64Call =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
@@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
{Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
}
- if (ST->hasDPPBroadcasts()) {
+ if (ST.hasDPPBroadcasts()) {
// GFX9 has DPP row broadcast operations.
V = buildNonAtomicBinOp(
B, Op, V,
@@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- assert(ST->hasPermLaneX16());
+ assert(ST.hasPermLaneX16());
Value *PermX =
B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
{PoisonValue::get(AtomicTy), V, B.getInt32(-1),
@@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Combine lane 31 into lanes 32..63.
Value *const Lane31 = B.CreateIntrinsic(
AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
M, Intrinsic::amdgcn_update_dpp, AtomicTy);
- if (ST->hasDPPWavefrontShifts()) {
+ if (ST.hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
V = B.CreateCall(UpdateDPP,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
@@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
B.getInt32(16), V});
- if (!ST->isWave32()) {
+ if (!ST.isWave32()) {
// Copy the old lane 31 to the new lane 32.
V = B.CreateCall(
WriteLane,
@@ -560,7 +552,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
auto *Ty = I.getType();
- auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
auto *EntryBB = I.getParent();
auto NeedResult = !I.use_empty();
@@ -698,7 +690,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
- [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+ [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -706,7 +698,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
- Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
CallInst *const Ballot =
B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
@@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
Value *Mbcnt;
- if (ST->isWave32()) {
+ if (ST.isWave32()) {
Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
{Ballot, B.getInt32(0)});
} else {
@@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// that they can correctly contribute to the final result.
NewV =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- if (!NeedResult && ST->hasPermLaneX16()) {
+ if (!NeedResult && ST.hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
// for performance.
@@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// Read the value from the last lane, which has accumulated the values
// of each active lane in the wavefront. This will be our new value
// which we will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
}
More information about the llvm-commits
mailing list