================
@@ -955,23 +1042,139 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// first active lane.
Result = B.CreateSelect(Cond, BroadcastI, Result);
}
+ return Result;
+ }
+ return nullptr;
+}
- if (IsPixelShader) {
- // Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+// Generate a dynamic branch based on the active lane count so that the
+// DPP scan path is only used when enough lanes are active to amortize its
+// overhead. When the active count is at or below the threshold, each lane
+// independently issues its own atomic, which is cheaper for small groups.
+//
+// CFG produced:
+//
+// EntryBB:
+// Ballot / Ctpop -> ActiveCount
+// cmp ActiveCount > Threshold
+// br -> DppBB or NoOptBB
+//
+// DppBB:
+// DPP scan / reduction, single-lane gate
+// br -> SingleLaneBB or DppExitBB
+//
+// SingleLaneBB:
+// atomic with reduced value
+// br -> DppExitBB
+//
+// DppExitBB:
+// readfirstlane + per-lane result
+// br -> MergeBB
+//
+// NoOptBB:
+// original atomic (unoptimized)
+// br -> MergeBB
+//
+// MergeBB:
+// PHI merges results from both paths
+Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
+ IRBuilder<> &B, Instruction &I, AtomicRMWInst::BinOp Op,
+ unsigned ValIdx) const {
+ Type *const Ty = I.getType();
+ const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
+ const bool NeedResult = !I.use_empty();
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
- PHI->addIncoming(Result, I.getParent());
- I.replaceAllUsesWith(PHI);
- } else {
- // Replace the original atomic instruction with the new one.
- I.replaceAllUsesWith(Result);
- }
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
+ CallInst *const Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+
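+  // Rank of this lane among the active lanes; rank 0 identifies the first
+  // active lane, which is gated into the single-lane atomic below.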
+ Value *Mbcnt = buildMbcnt(B, Ballot);
+
+ // Count active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), B.getInt32Ty(), false);
+
+ // Branch: if active lanes > threshold, use DPP; otherwise no-opt.
+ const unsigned Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
+ Value *const ThresholdCond = B.CreateICmpUGT(Ctpop, B.getInt32(Threshold));
+
+ BasicBlock *const EntryBB = I.getParent();
+ Instruction *const SplitPt = &I;
+ Instruction *const ThenTerm = SplitBlockAndInsertIfThen(
+ ThresholdCond, SplitPt, false, nullptr, &DTU, nullptr);
+ BasicBlock *const DppBB = ThenTerm->getParent();
+ BasicBlock *const MergeBB = I.getParent();
+
+ // Also create NoOptBB between DppBB and MergeBB.
+ BasicBlock *const NoOptBB = BasicBlock::Create(
+ I.getContext(), "atomicrmw.no_opt", I.getFunction(), MergeBB);
+ // Fix CFG: EntryBB's conditional branch false edge -> NoOptBB -> MergeBB.
+ BranchInst *const EntryBr = cast<BranchInst>(EntryBB->getTerminator());
+ EntryBr->setSuccessor(1, NoOptBB);
+ IRBuilder<> NoOptBuilder(NoOptBB);
+ NoOptBuilder.CreateBr(MergeBB);
+ DTU.applyUpdates({{DominatorTree::Insert, EntryBB, NoOptBB},
+ {DominatorTree::Delete, EntryBB, MergeBB},
+ {DominatorTree::Insert, NoOptBB, MergeBB}});
+
+ // DppBB: perform DPP scan/reduction and single-lane atomic.
+ B.SetInsertPoint(ThenTerm);
+
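+  // Sub is scanned as Add; the cloned atomic keeps the Sub opcode, so the
+  // first active lane still subtracts the reduced total.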
+ AtomicRMWInst::BinOp ScanOp = Op;
+ if (Op == AtomicRMWInst::Sub)
+ ScanOp = AtomicRMWInst::Add;
+
+ Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);
+ Value *V = I.getOperand(ValIdx);
+
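+  // NewV is the wave-reduced value fed to the single-lane atomic; ExclScan
+  // is this lane's exclusive prefix, only needed when the result is used.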
+ auto [NewV, ExclScan] =
+ buildDPPScanAndReduce(B, ScanOp, Ty, V, Identity, NeedResult);
+
+ Value *const DppIsFirst = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+
+ BasicBlock *const DppOrigBB = B.GetInsertBlock();
+ Instruction *const DppSingleTerm = SplitBlockAndInsertIfThen(
+ DppIsFirst, ThenTerm, false, nullptr, &DTU, nullptr);
+ B.SetInsertPoint(DppSingleTerm);
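+  // Clone the atomic into the single-lane block and feed it the reduced
+  // value; only the first active lane executes it.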
+ Instruction *const DppNewI = I.clone();
+ B.Insert(DppNewI);
+ DppNewI->setOperand(ValIdx, NewV);
+
+ Value *DppResult = nullptr;
+ if (NeedResult) {
----------------
taoyafan wrote:
Modified:
Instead of duplicating the DPP scan/reduction logic, delegate the actual
atomic optimization to optimizeAtomicImpl. Use SplitBlockAndInsertIfThenElse
to build the outer threshold CFG (OptBB/NoOptBB/TailBB), move I into OptBB,
and let optimizeAtomicImpl handle the rest.
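
A minimal sketch of that shape (assumptions: `ActiveCount`/`Threshold` are computed as in the current patch, `optimizeAtomicImpl(B, I, Op, ValIdx)` is the factored-out rewrite that returns the per-lane `Result` as in the hunk above, nullptr when `I` has no uses, and it leaves the original `I` in place for the caller to erase):

```cpp
// Sketch only; names not present in the patch are placeholders.
Value *const Cond = B.CreateICmpUGT(ActiveCount, B.getInt32(Threshold));

// EntryBB -> {OptBB, NoOptBB} -> TailBB, split around I.
Instruction *ThenTerm = nullptr;
Instruction *ElseTerm = nullptr;
SplitBlockAndInsertIfThenElse(Cond, &I, &ThenTerm, &ElseTerm,
                              /*BranchWeights=*/nullptr, &DTU);
BasicBlock *const NoOptBB = ElseTerm->getParent();
BasicBlock *const TailBB = I.getParent();

// The unoptimized path executes a verbatim copy of the atomic.
Instruction *const NoOptI = I.clone();
NoOptI->insertBefore(ElseTerm);

// Move I into OptBB and let the existing rewrite handle it there.
I.moveBefore(ThenTerm);
B.SetInsertPoint(&I);
if (Value *const OptResult = optimizeAtomicImpl(B, I, Op, ValIdx)) {
  // I.getParent() is whatever block the rewrite left I in, i.e. the block
  // that now exits the optimized path into TailBB.
  B.SetInsertPoint(TailBB, TailBB->getFirstNonPHIIt());
  PHINode *const PHI = B.CreatePHI(OptResult->getType(), 2);
  PHI->addIncoming(OptResult, I.getParent());
  PHI->addIncoming(NoOptI, NoOptBB);
  I.replaceAllUsesWith(PHI);
}
I.eraseFromParent();
```

Building the merge PHI after the call means its incoming block just tracks wherever the rewrite leaves I, so nothing inside optimizeAtomicImpl needs to know about the outer threshold branch.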
https://github.com/llvm/llvm-project/pull/186762