================
@@ -955,23 +1042,139 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// first active lane.
Result = B.CreateSelect(Cond, BroadcastI, Result);
}
+ return Result;
+ }
+ return nullptr;
+}
- if (IsPixelShader) {
- // Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+// Generate a dynamic branch based on the active lane count so that the
+// DPP scan path is only used when enough lanes are active to amortize its
+// overhead. When the active count is at or below the threshold, each lane
+// independently issues its own atomic, which is cheaper for small groups.
+//
+// CFG produced:
+//
+// EntryBB:
+// Ballot / Ctpop -> ActiveCount
+// cmp ActiveCount > Threshold
+// br -> DppBB or NoOptBB
+//
+// DppBB:
+// DPP scan / reduction, single-lane gate
+// br -> SingleLaneBB or DppExitBB
+//
+// SingleLaneBB:
+// atomic with reduced value
+// br -> DppExitBB
+//
+// DppExitBB:
+// readfirstlane + per-lane result
+// br -> MergeBB
+//
+// NoOptBB:
+// original atomic (unoptimized)
+// br -> MergeBB
+//
+// MergeBB:
+// PHI merges results from both paths
+Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
+ IRBuilder<> &B, Instruction &I, AtomicRMWInst::BinOp Op,
+ unsigned ValIdx) const {
+ Type *const Ty = I.getType();
+ const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
+ const bool NeedResult = !I.use_empty();
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
- PHI->addIncoming(Result, I.getParent());
- I.replaceAllUsesWith(PHI);
- } else {
- // Replace the original atomic instruction with the new one.
- I.replaceAllUsesWith(Result);
- }
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
+ CallInst *const Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+
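+  // Rank of this lane among the active lanes; rank 0 identifies the first
+  // active lane, which is gated into the single-lane atomic below.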
+ Value *Mbcnt = buildMbcnt(B, Ballot);
+
+ // Count active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), B.getInt32Ty(), false);
+
+ // Branch: if active lanes > threshold, use DPP; otherwise no-opt.
+ const unsigned Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
+ Value *const ThresholdCond = B.CreateICmpUGT(Ctpop, B.getInt32(Threshold));
+
+ BasicBlock *const EntryBB = I.getParent();
+ Instruction *const SplitPt = &I;
+ Instruction *const ThenTerm = SplitBlockAndInsertIfThen(
+ ThresholdCond, SplitPt, false, nullptr, &DTU, nullptr);
+ BasicBlock *const DppBB = ThenTerm->getParent();
+ BasicBlock *const MergeBB = I.getParent();
+
+ // Also create NoOptBB between DppBB and MergeBB.
+ BasicBlock *const NoOptBB = BasicBlock::Create(
+ I.getContext(), "atomicrmw.no_opt", I.getFunction(), MergeBB);
+ // Fix CFG: EntryBB's conditional branch false edge -> NoOptBB -> MergeBB.
+ BranchInst *const EntryBr = cast<BranchInst>(EntryBB->getTerminator());
+ EntryBr->setSuccessor(1, NoOptBB);
+ IRBuilder<> NoOptBuilder(NoOptBB);
+ NoOptBuilder.CreateBr(MergeBB);
+ DTU.applyUpdates({{DominatorTree::Insert, EntryBB, NoOptBB},
+ {DominatorTree::Delete, EntryBB, MergeBB},
+ {DominatorTree::Insert, NoOptBB, MergeBB}});
+
+ // DppBB: perform DPP scan/reduction and single-lane atomic.
+ B.SetInsertPoint(ThenTerm);
+
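+  // Sub is scanned as Add; the cloned atomic keeps the Sub opcode, so the
+  // first active lane still subtracts the reduced total.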
+ AtomicRMWInst::BinOp ScanOp = Op;
+ if (Op == AtomicRMWInst::Sub)
+ ScanOp = AtomicRMWInst::Add;
+
+ Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);
+ Value *V = I.getOperand(ValIdx);
+
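+  // NewV is the wave-reduced value fed to the single-lane atomic; ExclScan
+  // is this lane's exclusive prefix, only needed when the result is used.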
+ auto [NewV, ExclScan] =
+ buildDPPScanAndReduce(B, ScanOp, Ty, V, Identity, NeedResult);
+
+ Value *const DppIsFirst = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+
+ BasicBlock *const DppOrigBB = B.GetInsertBlock();
+ Instruction *const DppSingleTerm = SplitBlockAndInsertIfThen(
+ DppIsFirst, ThenTerm, false, nullptr, &DTU, nullptr);
+ B.SetInsertPoint(DppSingleTerm);
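+  // Clone the atomic into the single-lane block and feed it the reduced
+  // value; only the first active lane executes it.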
+ Instruction *const DppNewI = I.clone();
+ B.Insert(DppNewI);
+ DppNewI->setOperand(ValIdx, NewV);
+
+ Value *DppResult = nullptr;
+ if (NeedResult) {
----------------
taoyafan wrote:
Modified:
Instead of duplicating the DPP scan/reduction logic, delegate the actual
atomic optimization to optimizeAtomicImpl. Use SplitBlockAndInsertIfThenElse
to build the outer threshold CFG (OptBB/NoOptBB/TailBB), move I into OptBB,
and let optimizeAtomicImpl handle the rest.
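
A minimal sketch of that shape (assumptions: `ActiveCount`/`Threshold` are computed as in the current patch, `optimizeAtomicImpl(B, I, Op, ValIdx)` is the factored-out rewrite that returns the per-lane `Result` as in the hunk above, nullptr when `I` has no uses, and it leaves the original `I` in place for the caller to erase):

```cpp
// Sketch only; names not present in the patch are placeholders.
Value *const Cond = B.CreateICmpUGT(ActiveCount, B.getInt32(Threshold));

// EntryBB -> {OptBB, NoOptBB} -> TailBB, split around I.
Instruction *ThenTerm = nullptr;
Instruction *ElseTerm = nullptr;
SplitBlockAndInsertIfThenElse(Cond, &I, &ThenTerm, &ElseTerm,
                              /*BranchWeights=*/nullptr, &DTU);
BasicBlock *const NoOptBB = ElseTerm->getParent();
BasicBlock *const TailBB = I.getParent();

// The unoptimized path executes a verbatim copy of the atomic.
Instruction *const NoOptI = I.clone();
NoOptI->insertBefore(ElseTerm);

// Move I into OptBB and let the existing rewrite handle it there.
I.moveBefore(ThenTerm);
B.SetInsertPoint(&I);
if (Value *const OptResult = optimizeAtomicImpl(B, I, Op, ValIdx)) {
  // I.getParent() is whatever block the rewrite left I in, i.e. the block
  // that now exits the optimized path into TailBB.
  B.SetInsertPoint(TailBB, TailBB->getFirstNonPHIIt());
  PHINode *const PHI = B.CreatePHI(OptResult->getType(), 2);
  PHI->addIncoming(OptResult, I.getParent());
  PHI->addIncoming(NoOptI, NoOptBB);
  I.replaceAllUsesWith(PHI);
}
I.eraseFromParent();
```

Building the merge PHI after the call means its incoming block just tracks wherever the rewrite leaves I, so nothing inside optimizeAtomicImpl needs to know about the outer threshold branch.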
https://github.com/llvm/llvm-project/pull/186762