[llvm] [AMDGPU] Add dynamic threshold for DPP atomic optimizer on integer LDS atomics (PR #186762)
YaFan Tao via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 27 01:48:14 PDT 2026
https://github.com/taoyafan updated https://github.com/llvm/llvm-project/pull/186762
>From 03ba3d87a73faaf5bbc9d3510b683077fab3d2e5 Mon Sep 17 00:00:00 2001
From: YaFan <YaFan.Tao at amd.com>
Date: Mon, 16 Mar 2026 17:23:06 +0800
Subject: [PATCH 1/6] [AMDGPU] Add dynamic threshold for DPP atomic optimizer
on integer LDS atomics
When only a few lanes are active, the DPP wavefront scan overhead can
outweigh the benefit of reducing LDS atomic contention. Profiling a
DX12 application showed that the affected shader ran ~20 percent faster
(and the whole application ~2.7 percent faster) with the atomic
optimizer disabled entirely.
Add a hidden command-line option -amdgpu-atomic-optimizer-dpp-lds-threshold
that emits a dynamic branch: when the active lane count exceeds the
threshold the existing DPP scan path is taken; otherwise each lane
independently issues its own atomic.
Only integer LDS atomics are affected because DX12 does not expose
floating-point LDS atomic instructions, so the float path is untested.
The default threshold is 0 (always use DPP) because unconditionally
enabling the threshold can regress DPP-friendly workloads by ~4 percent.
Implementation details:
- Extract buildMbcnt and buildDPPScanAndReduce as reusable helpers.
- Extract optimizeAtomicImpl from optimizeAtomic so the threshold path
can share the pixel-shader and replacement logic.
- Add optimizeAtomicWithDynamicThreshold which builds the two-path CFG.
- Add atomic_optimizations_dpp_lds_threshold.ll testing threshold=5
(dynamic branch) and threshold>=wavefront-size (DPP fully disabled).
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 308 +++-
.../atomic_optimizations_dpp_lds_threshold.ll | 1528 +++++++++++++++++
2 files changed, 1784 insertions(+), 52 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 784ee36d55c1e..52996dd2ddc66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -38,6 +38,23 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+// When only a few lanes are active, the DPP scan overhead can outweigh the
+// benefit of reducing LDS atomic contention. This threshold lets the compiler
+// emit a dynamic branch so that small-active-lane cases fall back to
+// unoptimized per-lane atomics while large-active-lane cases still benefit from
+// DPP.
+//
+// Only integer LDS atomics are affected because floating-point path is
+// untested. The default is 0 (always use DPP) because unconditionally enabling
+// the threshold can regress DPP-friendly workloads by ~4% (tested on RX
+// 7900XT).
+static cl::opt<unsigned> AMDGPUAtomicOptimizerDPPLdsThreshold(
+ "amdgpu-atomic-optimizer-dpp-lds-threshold",
+ cl::desc("Use DPP scan for integer LDS atomics only when active lanes > "
+ "this value; floating-point LDS atomics are not affected "
+ "(0 = always use DPP, >= wavefront size = never use DPP)"),
+ cl::init(0), cl::Hidden);
+
namespace {
struct ReplacementInfo {
@@ -45,6 +62,7 @@ struct ReplacementInfo {
AtomicRMWInst::BinOp Op;
unsigned ValIdx;
bool ValDivergent;
+ bool IsLDS;
};
class AMDGPUAtomicOptimizer : public FunctionPass {
@@ -86,8 +104,22 @@ class AMDGPUAtomicOptimizerImpl
Value *const Identity, Value *V, Instruction &I,
BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;
+ Value *buildMbcnt(IRBuilder<> &B, Value *Ballot) const;
+
+ std::pair<Value *, Value *>
+ buildDPPScanAndReduce(IRBuilder<> &B, AtomicRMWInst::BinOp ScanOp, Type *Ty,
+ Value *V, Value *Identity, bool NeedResult) const;
+
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
- bool ValDivergent) const;
+ bool ValDivergent, bool IsLDS) const;
+
+ Value *optimizeAtomicImpl(IRBuilder<> &B, Instruction &I,
+ AtomicRMWInst::BinOp Op, unsigned ValIdx,
+ bool ValDivergent) const;
+
+ Value *optimizeAtomicWithDynamicThreshold(IRBuilder<> &B, Instruction &I,
+ AtomicRMWInst::BinOp Op,
+ unsigned ValIdx) const;
public:
AMDGPUAtomicOptimizerImpl() = delete;
@@ -159,8 +191,8 @@ bool AMDGPUAtomicOptimizerImpl::run() {
if (ToReplace.empty())
return false;
- for (auto &[I, Op, ValIdx, ValDivergent] : ToReplace)
- optimizeAtomic(*I, Op, ValIdx, ValDivergent);
+ for (auto &[I, Op, ValIdx, ValDivergent, IsLDS] : ToReplace)
+ optimizeAtomic(*I, Op, ValIdx, ValDivergent, IsLDS);
ToReplace.clear();
return true;
}
@@ -239,10 +271,12 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
return;
}
+ const bool IsLDS = I.getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+
// If we get here, we can optimize the atomic using a single wavefront-wide
// atomic operation to do the calculation for the entire wavefront, so
// remember the instruction so we can come back to it.
- ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
+ ToReplace.push_back({&I, Op, ValIdx, ValDivergent, IsLDS});
}
void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
@@ -333,7 +367,8 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
// If we get here, we can optimize the atomic using a single wavefront-wide
// atomic operation to do the calculation for the entire wavefront, so
// remember the instruction so we can come back to it.
- ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
+ // Buffer atomics are never LDS.
+ ToReplace.push_back({&I, Op, ValIdx, ValDivergent, /*IsLDS=*/false});
}
// Use the builder to create the non-atomic counterpart of the specified
@@ -602,6 +637,47 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
return {OldValue, NewAccumulator};
}
+// Use the builder to create an mbcnt intrinsic call to count the number of
+// active lanes below the current lane.
+Value *AMDGPUAtomicOptimizerImpl::buildMbcnt(IRBuilder<> &B,
+ Value *Ballot) const {
+ Type *Int32Ty = B.getInt32Ty();
+ if (ST.isWave32()) {
+ return B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo,
+ {Ballot, B.getInt32(0)});
+ }
+ Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
+ Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
+ Value *Mbcnt =
+ B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {ExtractLo, B.getInt32(0)});
+ return B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {ExtractHi, Mbcnt});
+}
+
+// Use the builder to create a DPP-based scan or reduction across the
+// wavefront on divergent values. Sets inactive lanes to the identity value,
+// then either builds a full scan (when the result is needed) or just a
+// reduction. Returns {ReducedValue, ExclusiveScan}; ExclusiveScan is nullptr
+// when !NeedResult && ST.hasPermLaneX16().
+std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildDPPScanAndReduce(
+ IRBuilder<> &B, AtomicRMWInst::BinOp ScanOp, Type *Ty, Value *V,
+ Value *Identity, bool NeedResult) const {
+ Value *NewV =
+ B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+ Value *ExclScan = nullptr;
+ if (!NeedResult && ST.hasPermLaneX16()) {
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
+ Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
+ NewV =
+ B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx});
+ }
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
+ return {NewV, ExclScan};
+}
+
static Constant *getIdentityValueForAtomicOp(Type *const Ty,
AtomicRMWInst::BinOp Op) {
LLVMContext &C = Ty->getContext();
@@ -644,7 +720,8 @@ static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
AtomicRMWInst::BinOp Op,
unsigned ValIdx,
- bool ValDivergent) const {
+ bool ValDivergent,
+ bool IsLDS) const {
// Start building just before the instruction.
IRBuilder<> B(&I);
@@ -677,6 +754,47 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
B.SetInsertPoint(&I);
}
+ // Decide which implementation path to use.
+ // For DPP strategy with integer, divergent-value LDS atomics, when a
+ // threshold is set, use dynamic branching based on active lane count.
+ const unsigned Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
+ Value *Result = nullptr;
+ if (IsLDS && ValDivergent && ScanImpl == ScanOptions::DPP &&
+ !AtomicRMWInst::isFPOperation(Op) && Threshold > 0 &&
+ Threshold < ST.getWavefrontSize()) {
+ Result = optimizeAtomicWithDynamicThreshold(B, I, Op, ValIdx);
+ } else {
+ Result = optimizeAtomicImpl(B, I, Op, ValIdx, ValDivergent);
+ }
+
+ // Handle pixel shader reconvergence and replace original instruction.
+ if (Result) {
+ if (IsPixelShader) {
+ // Need a final PHI to reconverge to above the helper lane branch mask.
+ B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+
+ PHINode *const PHI = B.CreatePHI(I.getType(), 2);
+ PHI->addIncoming(PoisonValue::get(I.getType()), PixelEntryBB);
+ PHI->addIncoming(Result, I.getParent());
+ I.replaceAllUsesWith(PHI);
+ } else {
+ // Replace the original atomic instruction with the new one.
+ I.replaceAllUsesWith(Result);
+ }
+ }
+
+ // And delete the original.
+ I.eraseFromParent();
+}
+
+// Replace a single atomic with a wavefront-wide scan/reduction followed by a
+// single-lane atomic, then reconstruct per-lane results. Returns the per-lane
+// result, or nullptr when the result is unused.
+Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicImpl(IRBuilder<> &B,
+ Instruction &I,
+ AtomicRMWInst::BinOp Op,
+ unsigned ValIdx,
+ bool ValDivergent) const {
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
@@ -696,17 +814,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *Mbcnt;
- if (ST.isWave32()) {
- Mbcnt =
- B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {Ballot, B.getInt32(0)});
- } else {
- Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
- Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
- Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo,
- {ExtractLo, B.getInt32(0)});
- Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {ExtractHi, Mbcnt});
- }
+ Value *Mbcnt = buildMbcnt(B, Ballot);
Function *F = I.getFunction();
LLVMContext &C = F->getContext();
@@ -732,28 +840,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// using DPP.
if (ValDivergent) {
if (ScanImpl == ScanOptions::DPP) {
- // First we need to set all inactive invocations to the identity value, so
- // that they can correctly contribute to the final result.
- NewV =
- B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- if (!NeedResult && ST.hasPermLaneX16()) {
- // On GFX10 the permlanex16 instruction helps us build a reduction
- // without too many readlanes and writelanes, which are generally bad
- // for performance.
- NewV = buildReduction(B, ScanOp, NewV, Identity);
- } else {
- NewV = buildScan(B, ScanOp, NewV, Identity);
- if (NeedResult)
- ExclScan = buildShiftRight(B, NewV, Identity);
- // Read the value from the last lane, which has accumulated the values
- // of each active lane in the wavefront. This will be our new value
- // which we will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
- NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
- {NewV, LastLaneIdx});
- }
- // Finally mark the readlanes in the WWM section.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
+ std::tie(NewV, ExclScan) =
+ buildDPPScanAndReduce(B, ScanOp, Ty, V, Identity, NeedResult);
} else if (ScanImpl == ScanOptions::Iterative) {
// Alternative implementation for scan
ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
@@ -955,23 +1043,139 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// first active lane.
Result = B.CreateSelect(Cond, BroadcastI, Result);
}
+ return Result;
+ }
+ return nullptr;
+}
- if (IsPixelShader) {
- // Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+// Generate a dynamic branch based on the active lane count so that the
+// DPP scan path is only used when enough lanes are active to amortise its
+// overhead. When the active count is at or below the threshold, each lane
+// independently issues its own atomic, which is cheaper for small groups.
+//
+// CFG produced:
+//
+// EntryBB:
+// Ballot / Ctpop -> ActiveCount
+// cmp ActiveCount >= Threshold
+// br -> DppBB or NoOptBB
+//
+// DppBB:
+// DPP scan / reduction, single-lane gate
+// br -> SingleLaneBB or DppExitBB
+//
+// SingleLaneBB:
+// atomic with reduced value
+// br -> DppExitBB
+//
+// DppExitBB:
+// readfirstlane + per-lane result
+// br -> MergeBB
+//
+// NoOptBB:
+// original atomic (unoptimized)
+// br -> MergeBB
+//
+// MergeBB:
+// PHI merges results from both paths
+Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
+ IRBuilder<> &B, Instruction &I, AtomicRMWInst::BinOp Op,
+ unsigned ValIdx) const {
+ Type *const Ty = I.getType();
+ const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
+ const bool NeedResult = !I.use_empty();
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
- PHI->addIncoming(Result, I.getParent());
- I.replaceAllUsesWith(PHI);
- } else {
- // Replace the original atomic instruction with the new one.
- I.replaceAllUsesWith(Result);
- }
+ Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
+ CallInst *const Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+
+ Value *Mbcnt = buildMbcnt(B, Ballot);
+
+ // Count active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), B.getInt32Ty(), false);
+
+ // Branch: if active lanes > threshold, use DPP; otherwise no-opt.
+ const unsigned Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
+ Value *const ThresholdCond = B.CreateICmpUGT(Ctpop, B.getInt32(Threshold));
+
+ BasicBlock *const EntryBB = I.getParent();
+ Instruction *const SplitPt = &I;
+ Instruction *const ThenTerm = SplitBlockAndInsertIfThen(
+ ThresholdCond, SplitPt, false, nullptr, &DTU, nullptr);
+ BasicBlock *const DppBB = ThenTerm->getParent();
+ BasicBlock *const MergeBB = I.getParent();
+
+ // Also create NoOptBB between DppBB and MergeBB.
+ BasicBlock *const NoOptBB = BasicBlock::Create(
+ I.getContext(), "atomicrmw.no_opt", I.getFunction(), MergeBB);
+ // Fix CFG: EntryBB's conditional branch false edge -> NoOptBB -> MergeBB.
+ BranchInst *const EntryBr = cast<BranchInst>(EntryBB->getTerminator());
+ EntryBr->setSuccessor(1, NoOptBB);
+ IRBuilder<> NoOptBuilder(NoOptBB);
+ NoOptBuilder.CreateBr(MergeBB);
+ DTU.applyUpdates({{DominatorTree::Insert, EntryBB, NoOptBB},
+ {DominatorTree::Delete, EntryBB, MergeBB},
+ {DominatorTree::Insert, NoOptBB, MergeBB}});
+
+ // DppBB: perform DPP scan/reduction and single-lane atomic.
+ B.SetInsertPoint(ThenTerm);
+
+ AtomicRMWInst::BinOp ScanOp = Op;
+ if (Op == AtomicRMWInst::Sub)
+ ScanOp = AtomicRMWInst::Add;
+
+ Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);
+ Value *V = I.getOperand(ValIdx);
+
+ auto [NewV, ExclScan] =
+ buildDPPScanAndReduce(B, ScanOp, Ty, V, Identity, NeedResult);
+
+ Value *const DppIsFirst = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+
+ BasicBlock *const DppOrigBB = B.GetInsertBlock();
+ Instruction *const DppSingleTerm = SplitBlockAndInsertIfThen(
+ DppIsFirst, ThenTerm, false, nullptr, &DTU, nullptr);
+ B.SetInsertPoint(DppSingleTerm);
+ Instruction *const DppNewI = I.clone();
+ B.Insert(DppNewI);
+ DppNewI->setOperand(ValIdx, NewV);
+
+ Value *DppResult = nullptr;
+ if (NeedResult) {
+ B.SetInsertPoint(ThenTerm);
+ PHINode *const DppPHI = B.CreatePHI(Ty, 2);
+ DppPHI->addIncoming(PoisonValue::get(Ty), DppOrigBB);
+ DppPHI->addIncoming(DppNewI, DppSingleTerm->getParent());
+
+ Value *ReadlaneVal = DppPHI;
+ if (TyBitWidth < 32)
+ ReadlaneVal = B.CreateZExt(DppPHI, B.getInt32Ty());
+ Value *BroadcastI = B.CreateIntrinsic(
+ ReadlaneVal->getType(), Intrinsic::amdgcn_readfirstlane, ReadlaneVal);
+ if (TyBitWidth < 32)
+ BroadcastI = B.CreateTrunc(BroadcastI, Ty);
+
+ Value *LaneOffset =
+ B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+ DppResult = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
}
- // And delete the original.
- I.eraseFromParent();
+ // NoOptBB: each lane independently issues its own atomic.
+ NoOptBuilder.SetInsertPoint(NoOptBB->getTerminator());
+ Instruction *const NoOptNewI = I.clone();
+ NoOptBuilder.Insert(NoOptNewI);
+ Value *NoOptResult = NeedResult ? static_cast<Value *>(NoOptNewI) : nullptr;
+
+ // MergeBB: PHI-merge results from DppBB and NoOptBB.
+ if (NeedResult) {
+ B.SetInsertPoint(MergeBB, MergeBB->getFirstNonPHIIt());
+ PHINode *const MergePHI = B.CreatePHI(Ty, 2);
+ MergePHI->addIncoming(DppResult, ThenTerm->getParent());
+ MergePHI->addIncoming(NoOptResult, NoOptBB);
+ return MergePHI;
+ }
+ return nullptr;
}
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll
new file mode 100644
index 0000000000000..0ef74d7ca2c02
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll
@@ -0,0 +1,1528 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; Test the -amdgpu-atomic-optimizer-dpp-lds-threshold option which controls
+; dynamic DPP vs no-opt branching for integer LDS atomics.
+;
+; Threshold=5: use DPP only when active lanes > 5, otherwise each lane does
+; its own atomic.
+; Threshold=32: on wave32 this disables DPP entirely (>= wavefront size).
+; Threshold=64: on wave64 this disables DPP entirely (>= wavefront size).
+
+; --- Threshold=5 tests (dynamic branch expected) ---
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=5 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-THRESH5 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=5 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-THRESH5 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=5 < %s | FileCheck -enable-var-scope -check-prefixes=GFX900-THRESH5 %s
+
+; --- Threshold >= wavefront size tests (DPP fully disabled for int LDS) ---
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=32 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DISABLED %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DISABLED %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+ at local_var32 = addrspace(3) global i32 undef, align 4
+
+; Test 1: divergent i32 add with result used -- dynamic threshold branch expected
+define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
+; GFX1032-THRESH5-LABEL: add_i32_varying:
+; GFX1032-THRESH5: ; %bb.0: ; %entry
+; GFX1032-THRESH5-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1032-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX1032-THRESH5-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX1032-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-THRESH5-NEXT: ds_add_rtn_u32 v4, v4, v0
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB0_3
+; GFX1032-THRESH5-NEXT: s_branch .LBB0_6
+; GFX1032-THRESH5-NEXT: .LBB0_2:
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1032-THRESH5-NEXT: .LBB0_3:
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s1, v1, 15
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_writelane_b32 v3, s1, 16
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1032-THRESH5-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB0_5
+; GFX1032-THRESH5-NEXT: ; %bb.4:
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v4, s2
+; GFX1032-THRESH5-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-THRESH5-NEXT: ds_add_rtn_u32 v4, v0, v4
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB0_5:
+; GFX1032-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-THRESH5-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_e32 v4, s0, v0
+; GFX1032-THRESH5-NEXT: .LBB0_6:
+; GFX1032-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_store_dword v4, off, s[0:3], 0
+; GFX1032-THRESH5-NEXT: s_endpgm
+;
+; GFX1064-THRESH5-LABEL: add_i32_varying:
+; GFX1064-THRESH5: ; %bb.0: ; %entry
+; GFX1064-THRESH5-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1064-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX1064-THRESH5-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX1064-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-THRESH5-NEXT: ds_add_rtn_u32 v4, v4, v0
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB0_3
+; GFX1064-THRESH5-NEXT: s_branch .LBB0_6
+; GFX1064-THRESH5-NEXT: .LBB0_2:
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1064-THRESH5-NEXT: .LBB0_3:
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 15
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s3, v1, 31
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s2, 16
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 63
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s3, 32
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1064-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB0_5
+; GFX1064-THRESH5-NEXT: ; %bb.4:
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v4, s2
+; GFX1064-THRESH5-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-THRESH5-NEXT: ds_add_rtn_u32 v4, v0, v4
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB0_5:
+; GFX1064-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_e32 v4, s0, v0
+; GFX1064-THRESH5-NEXT: .LBB0_6:
+; GFX1064-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_store_dword v4, off, s[0:3], 0
+; GFX1064-THRESH5-NEXT: s_endpgm
+;
+; GFX900-THRESH5-LABEL: add_i32_varying:
+; GFX900-THRESH5: ; %bb.0: ; %entry
+; GFX900-THRESH5-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX900-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX900-THRESH5-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX900-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-THRESH5-NEXT: ds_add_rtn_u32 v3, v3, v0
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB0_3
+; GFX900-THRESH5-NEXT: s_branch .LBB0_6
+; GFX900-THRESH5-NEXT: .LBB0_2:
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr3
+; GFX900-THRESH5-NEXT: .LBB0_3:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX900-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX900-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-THRESH5-NEXT: s_nop 0
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_readlane_b32 s2, v1, 63
+; GFX900-THRESH5-NEXT: s_nop 0
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX900-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX900-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB0_5
+; GFX900-THRESH5-NEXT: ; %bb.4:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-THRESH5-NEXT: ds_add_rtn_u32 v0, v3, v0
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: .LBB0_5:
+; GFX900-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX900-THRESH5-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-THRESH5-NEXT: v_add_u32_e32 v3, s0, v0
+; GFX900-THRESH5-NEXT: .LBB0_6:
+; GFX900-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-THRESH5-NEXT: s_mov_b32 s3, 0xf000
+; GFX900-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GFX900-THRESH5-NEXT: s_endpgm
+;
+; GFX1032-DISABLED-LABEL: add_i32_varying:
+; GFX1032-DISABLED: ; %bb.0: ; %entry
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s1, v1, 15
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_writelane_b32 v3, s1, 16
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s0, s2
+; GFX1032-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1032-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1032-DISABLED-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-DISABLED-NEXT: s_cbranch_execz .LBB0_2
+; GFX1032-DISABLED-NEXT: ; %bb.1:
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032-DISABLED-NEXT: ds_add_rtn_u32 v0, v4, v0
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_gl0_inv
+; GFX1032-DISABLED-NEXT: .LBB0_2:
+; GFX1032-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-DISABLED-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032-DISABLED-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-DISABLED-NEXT: s_endpgm
+;
+; GFX1064-DISABLED-LABEL: add_i32_varying:
+; GFX1064-DISABLED: ; %bb.0: ; %entry
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 15
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s3, v1, 31
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s2, 16
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 47
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s6, v1, 63
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s3, 32
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s2, 48
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1064-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1064-DISABLED-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DISABLED-NEXT: s_cbranch_execz .LBB0_2
+; GFX1064-DISABLED-NEXT: ; %bb.1:
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-DISABLED-NEXT: ds_add_rtn_u32 v0, v4, v0
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_gl0_inv
+; GFX1064-DISABLED-NEXT: .LBB0_2:
+; GFX1064-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-DISABLED-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-DISABLED-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-DISABLED-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+; Test 2: divergent i32 sub with result used
+define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
+; GFX1032-THRESH5-LABEL: sub_i32_varying:
+; GFX1032-THRESH5: ; %bb.0: ; %entry
+; GFX1032-THRESH5-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1032-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX1032-THRESH5-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX1032-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-THRESH5-NEXT: ds_sub_rtn_u32 v4, v4, v0
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB1_3
+; GFX1032-THRESH5-NEXT: s_branch .LBB1_6
+; GFX1032-THRESH5-NEXT: .LBB1_2:
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1032-THRESH5-NEXT: .LBB1_3:
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s1, v1, 15
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_writelane_b32 v3, s1, 16
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1032-THRESH5-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB1_5
+; GFX1032-THRESH5-NEXT: ; %bb.4:
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v4, s2
+; GFX1032-THRESH5-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-THRESH5-NEXT: ds_sub_rtn_u32 v4, v0, v4
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB1_5:
+; GFX1032-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-THRESH5-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-THRESH5-NEXT: v_sub_nc_u32_e32 v4, s0, v0
+; GFX1032-THRESH5-NEXT: .LBB1_6:
+; GFX1032-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_store_dword v4, off, s[0:3], 0
+; GFX1032-THRESH5-NEXT: s_endpgm
+;
+; GFX1064-THRESH5-LABEL: sub_i32_varying:
+; GFX1064-THRESH5: ; %bb.0: ; %entry
+; GFX1064-THRESH5-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1064-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX1064-THRESH5-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX1064-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-THRESH5-NEXT: ds_sub_rtn_u32 v4, v4, v0
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB1_3
+; GFX1064-THRESH5-NEXT: s_branch .LBB1_6
+; GFX1064-THRESH5-NEXT: .LBB1_2:
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1064-THRESH5-NEXT: .LBB1_3:
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 15
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s3, v1, 31
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s2, 16
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 63
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s6, v1, 47
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s3, 32
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s6, 48
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr4
+; GFX1064-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB1_5
+; GFX1064-THRESH5-NEXT: ; %bb.4:
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v4, s2
+; GFX1064-THRESH5-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-THRESH5-NEXT: ds_sub_rtn_u32 v4, v0, v4
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB1_5:
+; GFX1064-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_readfirstlane_b32 s0, v4
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-THRESH5-NEXT: v_sub_nc_u32_e32 v4, s0, v0
+; GFX1064-THRESH5-NEXT: .LBB1_6:
+; GFX1064-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_store_dword v4, off, s[0:3], 0
+; GFX1064-THRESH5-NEXT: s_endpgm
+;
+; GFX900-THRESH5-LABEL: sub_i32_varying:
+; GFX900-THRESH5: ; %bb.0: ; %entry
+; GFX900-THRESH5-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX900-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX900-THRESH5-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX900-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-THRESH5-NEXT: ds_sub_rtn_u32 v3, v3, v0
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB1_3
+; GFX900-THRESH5-NEXT: s_branch .LBB1_6
+; GFX900-THRESH5-NEXT: .LBB1_2:
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr3
+; GFX900-THRESH5-NEXT: .LBB1_3:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX900-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX900-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-THRESH5-NEXT: s_nop 0
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_readlane_b32 s2, v1, 63
+; GFX900-THRESH5-NEXT: s_nop 0
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX900-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX900-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB1_5
+; GFX900-THRESH5-NEXT: ; %bb.4:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-THRESH5-NEXT: ds_sub_rtn_u32 v0, v3, v0
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: .LBB1_5:
+; GFX900-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX900-THRESH5-NEXT: v_readfirstlane_b32 s0, v0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-THRESH5-NEXT: v_sub_u32_e32 v3, s0, v0
+; GFX900-THRESH5-NEXT: .LBB1_6:
+; GFX900-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-THRESH5-NEXT: s_mov_b32 s3, 0xf000
+; GFX900-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GFX900-THRESH5-NEXT: s_endpgm
+;
+; GFX1032-DISABLED-LABEL: sub_i32_varying:
+; GFX1032-DISABLED: ; %bb.0: ; %entry
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s1, v1, 15
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_writelane_b32 v3, s1, 16
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s0, s2
+; GFX1032-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1032-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1032-DISABLED-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-DISABLED-NEXT: s_cbranch_execz .LBB1_2
+; GFX1032-DISABLED-NEXT: ; %bb.1:
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032-DISABLED-NEXT: ds_sub_rtn_u32 v0, v4, v0
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_gl0_inv
+; GFX1032-DISABLED-NEXT: .LBB1_2:
+; GFX1032-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-DISABLED-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032-DISABLED-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DISABLED-NEXT: v_sub_nc_u32_e32 v0, s3, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-DISABLED-NEXT: s_endpgm
+;
+; GFX1064-DISABLED-LABEL: sub_i32_varying:
+; GFX1064-DISABLED: ; %bb.0: ; %entry
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 15
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s3, v1, 31
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s2, 16
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 47
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s6, v1, 63
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s3, 32
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s2, 48
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1064-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1064-DISABLED-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DISABLED-NEXT: s_cbranch_execz .LBB1_2
+; GFX1064-DISABLED-NEXT: ; %bb.1:
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-DISABLED-NEXT: ds_sub_rtn_u32 v0, v4, v0
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_gl0_inv
+; GFX1064-DISABLED-NEXT: .LBB1_2:
+; GFX1064-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-DISABLED-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-DISABLED-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DISABLED-NEXT: v_sub_nc_u32_e32 v0, s3, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-DISABLED-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+; Test 3: divergent i32 add without result used (NeedResult=false)
+define amdgpu_kernel void @add_i32_varying_nouse() {
+; GFX1032-THRESH5-LABEL: add_i32_varying_nouse:
+; GFX1032-THRESH5: ; %bb.0: ; %entry
+; GFX1032-THRESH5-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1032-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX1032-THRESH5-NEXT: s_mov_b32 s0, -1
+; GFX1032-THRESH5-NEXT: s_cbranch_scc0 .LBB2_2
+; GFX1032-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-THRESH5-NEXT: s_mov_b32 s0, 0
+; GFX1032-THRESH5-NEXT: ds_add_u32 v3, v0
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB2_2: ; %Flow1
+; GFX1032-THRESH5-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
+; GFX1032-THRESH5-NEXT: s_cbranch_vccnz .LBB2_5
+; GFX1032-THRESH5-NEXT: ; %bb.3:
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032-THRESH5-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB2_5
+; GFX1032-THRESH5-NEXT: ; %bb.4:
+; GFX1032-THRESH5-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-THRESH5-NEXT: ds_add_u32 v0, v3
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB2_5:
+; GFX1032-THRESH5-NEXT: s_endpgm
+;
+; GFX1064-THRESH5-LABEL: add_i32_varying_nouse:
+; GFX1064-THRESH5: ; %bb.0: ; %entry
+; GFX1064-THRESH5-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1064-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX1064-THRESH5-NEXT: s_mov_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: s_cbranch_scc0 .LBB2_2
+; GFX1064-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-THRESH5-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-THRESH5-NEXT: ds_add_u32 v3, v0
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB2_2: ; %Flow1
+; GFX1064-THRESH5-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GFX1064-THRESH5-NEXT: s_cbranch_vccnz .LBB2_5
+; GFX1064-THRESH5-NEXT: ; %bb.3:
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s3, v1, 32
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-THRESH5-NEXT: s_add_i32 s0, s2, s3
+; GFX1064-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-THRESH5-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB2_5
+; GFX1064-THRESH5-NEXT: ; %bb.4:
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v3, s0
+; GFX1064-THRESH5-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-THRESH5-NEXT: ds_add_u32 v0, v3
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB2_5:
+; GFX1064-THRESH5-NEXT: s_endpgm
+;
+; GFX900-THRESH5-LABEL: add_i32_varying_nouse:
+; GFX900-THRESH5: ; %bb.0: ; %entry
+; GFX900-THRESH5-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX900-THRESH5-NEXT: s_cmp_lt_u32 s0, 6
+; GFX900-THRESH5-NEXT: s_mov_b64 s[0:1], -1
+; GFX900-THRESH5-NEXT: s_cbranch_scc0 .LBB2_2
+; GFX900-THRESH5-NEXT: ; %bb.1: ; %atomicrmw.no_opt
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-THRESH5-NEXT: ds_add_u32 v2, v0
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: s_mov_b64 s[0:1], 0
+; GFX900-THRESH5-NEXT: .LBB2_2: ; %Flow1
+; GFX900-THRESH5-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GFX900-THRESH5-NEXT: s_cbranch_vccnz .LBB2_5
+; GFX900-THRESH5-NEXT: ; %bb.3:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX900-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX900-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_readlane_b32 s2, v1, 63
+; GFX900-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX900-THRESH5-NEXT: s_mov_b32 s0, s2
+; GFX900-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX900-THRESH5-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB2_5
+; GFX900-THRESH5-NEXT: ; %bb.4:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-THRESH5-NEXT: ds_add_u32 v2, v0
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: .LBB2_5:
+; GFX900-THRESH5-NEXT: s_endpgm
+;
+; GFX1032-DISABLED-LABEL: add_i32_varying_nouse:
+; GFX1032-DISABLED: ; %bb.0: ; %entry
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032-DISABLED-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DISABLED-NEXT: s_cbranch_execz .LBB2_2
+; GFX1032-DISABLED-NEXT: ; %bb.1:
+; GFX1032-DISABLED-NEXT: ds_add_u32 v0, v3
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_gl0_inv
+; GFX1032-DISABLED-NEXT: .LBB2_2:
+; GFX1032-DISABLED-NEXT: s_endpgm
+;
+; GFX1064-DISABLED-LABEL: add_i32_varying_nouse:
+; GFX1064-DISABLED: ; %bb.0: ; %entry
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s3, v1, 32
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DISABLED-NEXT: s_add_i32 s0, s2, s3
+; GFX1064-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-DISABLED-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-DISABLED-NEXT: s_cbranch_execz .LBB2_2
+; GFX1064-DISABLED-NEXT: ; %bb.1:
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v3, s0
+; GFX1064-DISABLED-NEXT: ds_add_u32 v0, v3
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_gl0_inv
+; GFX1064-DISABLED-NEXT: .LBB2_2:
+; GFX1064-DISABLED-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
+ ret void
+}
+
+; Test 4: uniform i32 add -- should NOT be affected by threshold (not divergent)
+define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) {
+; GFX1032-THRESH5-LABEL: add_i32_uniform:
+; GFX1032-THRESH5: ; %bb.0: ; %entry
+; GFX1032-THRESH5-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032-THRESH5-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr1
+; GFX1032-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-THRESH5-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB3_2
+; GFX1032-THRESH5-NEXT: ; %bb.1:
+; GFX1032-THRESH5-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-THRESH5-NEXT: ds_add_rtn_u32 v1, v1, v2
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB3_2:
+; GFX1032-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-THRESH5-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-THRESH5-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX1032-THRESH5-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-THRESH5-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-THRESH5-NEXT: s_mov_b32 s10, -1
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX1032-THRESH5-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX1032-THRESH5-NEXT: s_endpgm
+;
+; GFX1064-THRESH5-LABEL: add_i32_uniform:
+; GFX1064-THRESH5: ; %bb.0: ; %entry
+; GFX1064-THRESH5-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064-THRESH5-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr1
+; GFX1064-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB3_2
+; GFX1064-THRESH5-NEXT: ; %bb.1:
+; GFX1064-THRESH5-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-THRESH5-NEXT: ds_add_rtn_u32 v1, v1, v2
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB3_2:
+; GFX1064-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-THRESH5-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX1064-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1064-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-THRESH5-NEXT: s_endpgm
+;
+; GFX900-THRESH5-LABEL: add_i32_uniform:
+; GFX900-THRESH5: ; %bb.0: ; %entry
+; GFX900-THRESH5-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX900-THRESH5-NEXT: s_mov_b64 s[2:3], exec
+; GFX900-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX900-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr1
+; GFX900-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB3_2
+; GFX900-THRESH5-NEXT: ; %bb.1:
+; GFX900-THRESH5-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: s_mul_i32 s2, s6, s2
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-THRESH5-NEXT: ds_add_rtn_u32 v1, v1, v2
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: .LBB3_2:
+; GFX900-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX900-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX900-THRESH5-NEXT: v_readfirstlane_b32 s4, v1
+; GFX900-THRESH5-NEXT: s_mov_b32 s3, 0xf000
+; GFX900-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX900-THRESH5-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX900-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX900-THRESH5-NEXT: s_endpgm
+;
+; GFX1032-DISABLED-LABEL: add_i32_uniform:
+; GFX1032-DISABLED: ; %bb.0: ; %entry
+; GFX1032-DISABLED-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032-DISABLED-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-DISABLED-NEXT: ; implicit-def: $vgpr1
+; GFX1032-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DISABLED-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-DISABLED-NEXT: s_cbranch_execz .LBB3_2
+; GFX1032-DISABLED-NEXT: ; %bb.1:
+; GFX1032-DISABLED-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v2, s2
+; GFX1032-DISABLED-NEXT: ds_add_rtn_u32 v1, v1, v2
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_gl0_inv
+; GFX1032-DISABLED-NEXT: .LBB3_2:
+; GFX1032-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-DISABLED-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-DISABLED-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX1032-DISABLED-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-DISABLED-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-DISABLED-NEXT: s_mov_b32 s10, -1
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX1032-DISABLED-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX1032-DISABLED-NEXT: s_endpgm
+;
+; GFX1064-DISABLED-LABEL: add_i32_uniform:
+; GFX1064-DISABLED: ; %bb.0: ; %entry
+; GFX1064-DISABLED-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064-DISABLED-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DISABLED-NEXT: ; implicit-def: $vgpr1
+; GFX1064-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DISABLED-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DISABLED-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DISABLED-NEXT: s_cbranch_execz .LBB3_2
+; GFX1064-DISABLED-NEXT: ; %bb.1:
+; GFX1064-DISABLED-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DISABLED-NEXT: ds_add_rtn_u32 v1, v1, v2
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_gl0_inv
+; GFX1064-DISABLED-NEXT: .LBB3_2:
+; GFX1064-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-DISABLED-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-DISABLED-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX1064-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1064-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-DISABLED-NEXT: s_endpgm
+entry:
+ %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %additive acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+; Test 5: divergent float fadd -- should NOT be affected by threshold (FP excluded)
+define amdgpu_kernel void @fadd_f32_varying(ptr addrspace(1) %out) {
+; GFX1032-THRESH5-LABEL: fadd_f32_varying:
+; GFX1032-THRESH5: ; %bb.0: ; %entry
+; GFX1032-THRESH5-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1032-THRESH5-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
+; GFX1032-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1032-THRESH5-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1032-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1032-THRESH5-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1032-THRESH5-NEXT: v_permlanex16_b32 v3, v1, -1, -1
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-THRESH5-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s1, v1, 15
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_writelane_b32 v3, s1, 16
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-THRESH5-NEXT: s_mov_b32 s0, s2
+; GFX1032-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX1032-THRESH5-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032-THRESH5-NEXT: ; %bb.1:
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v4, s0
+; GFX1032-THRESH5-NEXT: ds_add_rtn_f32 v0, v0, v4
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB4_2:
+; GFX1032-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-THRESH5-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032-THRESH5-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-THRESH5-NEXT: v_add_f32_e32 v0, s3, v0
+; GFX1032-THRESH5-NEXT: v_cndmask_b32_e64 v0, v0, s3, vcc_lo
+; GFX1032-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-THRESH5-NEXT: s_endpgm
+;
+; GFX1064-THRESH5-LABEL: fadd_f32_varying:
+; GFX1064-THRESH5: ; %bb.0: ; %entry
+; GFX1064-THRESH5-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1064-THRESH5-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1064-THRESH5-NEXT: v_permlanex16_b32 v3, v1, -1, -1
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 15
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s3, v1, 31
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s2, 16
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 47
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s6, v1, 63
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s3, 32
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s2, 48
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX1064-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064-THRESH5-NEXT: ; %bb.1:
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v4, s6
+; GFX1064-THRESH5-NEXT: ds_add_rtn_f32 v0, v0, v4
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB4_2:
+; GFX1064-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-THRESH5-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-THRESH5-NEXT: v_add_f32_e32 v0, s3, v0
+; GFX1064-THRESH5-NEXT: v_cndmask_b32_e64 v0, v0, s3, vcc
+; GFX1064-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-THRESH5-NEXT: s_endpgm
+;
+; GFX900-THRESH5-LABEL: fadd_f32_varying:
+; GFX900-THRESH5: ; %bb.0: ; %entry
+; GFX900-THRESH5-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX900-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX900-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX900-THRESH5-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-THRESH5-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v3, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX900-THRESH5-NEXT: v_readlane_b32 s2, v2, 63
+; GFX900-THRESH5-NEXT: s_nop 0
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX900-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX900-THRESH5-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB4_2
+; GFX900-THRESH5-NEXT: ; %bb.1:
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v4, s2
+; GFX900-THRESH5-NEXT: ds_add_rtn_f32 v0, v0, v4
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: .LBB4_2:
+; GFX900-THRESH5-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX900-THRESH5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-THRESH5-NEXT: v_readfirstlane_b32 s4, v0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-THRESH5-NEXT: v_add_f32_e32 v0, s4, v0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v4, s4
+; GFX900-THRESH5-NEXT: s_mov_b32 s3, 0xf000
+; GFX900-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX900-THRESH5-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX900-THRESH5-NEXT: s_endpgm
+;
+; GFX1032-DISABLED-LABEL: fadd_f32_varying:
+; GFX1032-DISABLED: ; %bb.0: ; %entry
+; GFX1032-DISABLED-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1032-DISABLED-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
+; GFX1032-DISABLED-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1032-DISABLED-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1032-DISABLED-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1032-DISABLED-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1032-DISABLED-NEXT: v_permlanex16_b32 v3, v1, -1, -1
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032-DISABLED-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s1, v1, 15
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_writelane_b32 v3, s1, 16
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s0, s2
+; GFX1032-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1032-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1032-DISABLED-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-DISABLED-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032-DISABLED-NEXT: ; %bb.1:
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v4, s0
+; GFX1032-DISABLED-NEXT: ds_add_rtn_f32 v0, v0, v4
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_gl0_inv
+; GFX1032-DISABLED-NEXT: .LBB4_2:
+; GFX1032-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-DISABLED-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032-DISABLED-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DISABLED-NEXT: v_add_f32_e32 v0, s3, v0
+; GFX1032-DISABLED-NEXT: v_cndmask_b32_e64 v0, v0, s3, vcc_lo
+; GFX1032-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-DISABLED-NEXT: s_endpgm
+;
+; GFX1064-DISABLED-LABEL: fadd_f32_varying:
+; GFX1064-DISABLED: ; %bb.0: ; %entry
+; GFX1064-DISABLED-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX1064-DISABLED-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX1064-DISABLED-NEXT: v_permlanex16_b32 v3, v1, -1, -1
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v3, s2
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 15
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s3, v1, 31
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s2, 16
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 47
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s6, v1, 63
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s3, 32
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s2, 48
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s2, -1
+; GFX1064-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1064-DISABLED-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DISABLED-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064-DISABLED-NEXT: ; %bb.1:
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v4, s6
+; GFX1064-DISABLED-NEXT: ds_add_rtn_f32 v0, v0, v4
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_gl0_inv
+; GFX1064-DISABLED-NEXT: .LBB4_2:
+; GFX1064-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-DISABLED-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-DISABLED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064-DISABLED-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DISABLED-NEXT: v_add_f32_e32 v0, s3, v0
+; GFX1064-DISABLED-NEXT: v_cndmask_b32_e64 v0, v0, s3, vcc
+; GFX1064-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-DISABLED-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %conv = sitofp i32 %lane to float
+ %rmw = atomicrmw fadd ptr addrspace(3) @local_var32, float %conv syncscope("agent") acq_rel
+ store float %rmw, ptr addrspace(1) %out
+ ret void
+}
+
+; Test 6: divergent i32 add on GLOBAL memory -- should NOT be affected by threshold
+define amdgpu_kernel void @add_i32_varying_global(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
+; GFX1032-THRESH5-LABEL: add_i32_varying_global:
+; GFX1032-THRESH5: ; %bb.0: ; %entry
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-THRESH5-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-THRESH5-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1032-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-THRESH5-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-THRESH5-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032-THRESH5-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1032-THRESH5-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-THRESH5-NEXT: s_mov_b32 s4, s6
+; GFX1032-THRESH5-NEXT: s_mov_b32 s6, -1
+; GFX1032-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX1032-THRESH5-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032-THRESH5-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032-THRESH5-NEXT: ; %bb.1:
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-THRESH5-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: s_mov_b32 s4, s2
+; GFX1032-THRESH5-NEXT: s_mov_b32 s5, s3
+; GFX1032-THRESH5-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1032-THRESH5-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-THRESH5-NEXT: buffer_gl1_inv
+; GFX1032-THRESH5-NEXT: buffer_gl0_inv
+; GFX1032-THRESH5-NEXT: .LBB5_2:
+; GFX1032-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-THRESH5-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-THRESH5-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-THRESH5-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1032-THRESH5-NEXT: s_mov_b32 s2, s6
+; GFX1032-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-THRESH5-NEXT: s_endpgm
+;
+; GFX1064-THRESH5-LABEL: add_i32_varying_global:
+; GFX1064-THRESH5: ; %bb.0: ; %entry
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-THRESH5-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-THRESH5-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1064-THRESH5-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-THRESH5-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064-THRESH5-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-THRESH5-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1064-THRESH5-NEXT: s_mov_b32 s4, s9
+; GFX1064-THRESH5-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1064-THRESH5-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1064-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-THRESH5-NEXT: s_mov_b32 s6, -1
+; GFX1064-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX1064-THRESH5-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064-THRESH5-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064-THRESH5-NEXT: ; %bb.1:
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064-THRESH5-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: s_mov_b32 s4, s2
+; GFX1064-THRESH5-NEXT: s_mov_b32 s5, s3
+; GFX1064-THRESH5-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1064-THRESH5-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-THRESH5-NEXT: buffer_gl1_inv
+; GFX1064-THRESH5-NEXT: buffer_gl0_inv
+; GFX1064-THRESH5-NEXT: .LBB5_2:
+; GFX1064-THRESH5-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-THRESH5-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-THRESH5-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-THRESH5-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-THRESH5-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-THRESH5-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1064-THRESH5-NEXT: s_mov_b32 s2, s6
+; GFX1064-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-THRESH5-NEXT: s_endpgm
+;
+; GFX900-THRESH5-LABEL: add_i32_varying_global:
+; GFX900-THRESH5: ; %bb.0: ; %entry
+; GFX900-THRESH5-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-THRESH5-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-THRESH5-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-THRESH5-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX900-THRESH5-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX900-THRESH5-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX900-THRESH5-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_nop 1
+; GFX900-THRESH5-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX900-THRESH5-NEXT: v_readlane_b32 s6, v2, 63
+; GFX900-THRESH5-NEXT: s_nop 0
+; GFX900-THRESH5-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX900-THRESH5-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-THRESH5-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX900-THRESH5-NEXT: ; implicit-def: $vgpr0
+; GFX900-THRESH5-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-THRESH5-NEXT: s_cbranch_execz .LBB5_2
+; GFX900-THRESH5-NEXT: ; %bb.1:
+; GFX900-THRESH5-NEXT: s_mov_b32 s11, 0xf000
+; GFX900-THRESH5-NEXT: s_mov_b32 s10, -1
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: s_mov_b32 s8, s2
+; GFX900-THRESH5-NEXT: s_mov_b32 s9, s3
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, s6
+; GFX900-THRESH5-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX900-THRESH5-NEXT: s_waitcnt vmcnt(0)
+; GFX900-THRESH5-NEXT: buffer_wbinvl1_vol
+; GFX900-THRESH5-NEXT: .LBB5_2:
+; GFX900-THRESH5-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-THRESH5-NEXT: v_readfirstlane_b32 s4, v0
+; GFX900-THRESH5-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-THRESH5-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-THRESH5-NEXT: s_mov_b32 s3, 0xf000
+; GFX900-THRESH5-NEXT: s_mov_b32 s2, -1
+; GFX900-THRESH5-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX900-THRESH5-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX900-THRESH5-NEXT: s_endpgm
+;
+; GFX1032-DISABLED-LABEL: add_i32_varying_global:
+; GFX1032-DISABLED: ; %bb.0: ; %entry
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-DISABLED-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-DISABLED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1032-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DISABLED-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DISABLED-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032-DISABLED-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1032-DISABLED-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s4, s6
+; GFX1032-DISABLED-NEXT: s_mov_b32 s6, -1
+; GFX1032-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1032-DISABLED-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032-DISABLED-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032-DISABLED-NEXT: ; %bb.1:
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-DISABLED-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: s_mov_b32 s4, s2
+; GFX1032-DISABLED-NEXT: s_mov_b32 s5, s3
+; GFX1032-DISABLED-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1032-DISABLED-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DISABLED-NEXT: buffer_gl1_inv
+; GFX1032-DISABLED-NEXT: buffer_gl0_inv
+; GFX1032-DISABLED-NEXT: .LBB5_2:
+; GFX1032-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032-DISABLED-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DISABLED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-DISABLED-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1032-DISABLED-NEXT: s_mov_b32 s2, s6
+; GFX1032-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-DISABLED-NEXT: s_endpgm
+;
+; GFX1064-DISABLED-LABEL: add_i32_varying_global:
+; GFX1064-DISABLED: ; %bb.0: ; %entry
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DISABLED-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DISABLED-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s2, v1, 31
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s6, v1, 15
+; GFX1064-DISABLED-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-DISABLED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s7, v1, 31
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064-DISABLED-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064-DISABLED-NEXT: v_readlane_b32 s9, v1, 63
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064-DISABLED-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DISABLED-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX1064-DISABLED-NEXT: s_mov_b32 s4, s9
+; GFX1064-DISABLED-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1064-DISABLED-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1064-DISABLED-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s6, -1
+; GFX1064-DISABLED-NEXT: ; implicit-def: $vgpr0
+; GFX1064-DISABLED-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064-DISABLED-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064-DISABLED-NEXT: ; %bb.1:
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064-DISABLED-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: s_mov_b32 s4, s2
+; GFX1064-DISABLED-NEXT: s_mov_b32 s5, s3
+; GFX1064-DISABLED-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1064-DISABLED-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DISABLED-NEXT: buffer_gl1_inv
+; GFX1064-DISABLED-NEXT: buffer_gl0_inv
+; GFX1064-DISABLED-NEXT: .LBB5_2:
+; GFX1064-DISABLED-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064-DISABLED-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064-DISABLED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DISABLED-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-DISABLED-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-DISABLED-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-DISABLED-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1064-DISABLED-NEXT: s_mov_b32 s2, s6
+; GFX1064-DISABLED-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-DISABLED-NEXT: s_endpgm
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane acq_rel
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
>From fe5b5529af1b8665d052a9405dce92f8008500c5 Mon Sep 17 00:00:00 2001
From: YaFan <YaFan.Tao at amd.com>
Date: Fri, 20 Mar 2026 15:03:21 +0800
Subject: [PATCH 2/6] Remove product name from comment per review feedback
---
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f3cde05cee673..2aea1e59d26de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -46,8 +46,7 @@ using namespace llvm::AMDGPU;
//
// Only integer LDS atomics are affected because floating-point path is
// untested. The default is 0 (always use DPP) because unconditionally enabling
-// the threshold can regress DPP-friendly workloads by ~4% (tested on RX
-// 7900XT).
+// the threshold can regress DPP-friendly workloads by ~4%.
static cl::opt<unsigned> AMDGPUAtomicOptimizerDPPLdsThreshold(
"amdgpu-atomic-optimizer-dpp-lds-threshold",
cl::desc("Use DPP scan for integer LDS atomics only when active lanes > "
>From d6f5399675de49a6155a2dbb54ab6e4a0e63f170 Mon Sep 17 00:00:00 2001
From: YaFan <YaFan.Tao at amd.com>
Date: Mon, 23 Mar 2026 15:46:15 +0800
Subject: [PATCH 3/6] Move RUN lines to top of test file per review feedback
---
.../atomic_optimizations_dpp_lds_threshold.ll | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll
index 0ef74d7ca2c02..0d52229a34edf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_dpp_lds_threshold.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
-; Test the -amdgpu-atomic-optimizer-dpp-lds-threshold option which controls
-; dynamic DPP vs no-opt branching for integer LDS atomics.
-;
-; Threshold=5: use DPP only when active lanes > 5, otherwise each lane does
-; its own atomic.
-; Threshold=32: on wave32 this disables DPP entirely (>= wavefront size).
-; Threshold=64: on wave64 this disables DPP entirely (>= wavefront size).
-
; --- Threshold=5 tests (dynamic branch expected) ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=5 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-THRESH5 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=5 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-THRESH5 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=5 < %s | FileCheck -enable-var-scope -check-prefixes=GFX900-THRESH5 %s
-
+;
; --- Threshold >= wavefront size tests (DPP fully disabled for int LDS) ---
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=32 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DISABLED %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -amdgpu-atomic-optimizer-dpp-lds-threshold=64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DISABLED %s
+;
+; Test the -amdgpu-atomic-optimizer-dpp-lds-threshold option which controls
+; dynamic DPP vs no-opt branching for integer LDS atomics.
+;
+; Threshold=5: use DPP only when active lanes > 5, otherwise each lane does
+; its own atomic.
+; Threshold=32: on wave32 this disables DPP entirely (>= wavefront size).
+; Threshold=64: on wave64 this disables DPP entirely (>= wavefront size).
declare i32 @llvm.amdgcn.workitem.id.x()
>From 4c6ea98afb5b95ff6ede96f338e449cacc069344 Mon Sep 17 00:00:00 2001
From: YaFan <YaFan.Tao at amd.com>
Date: Mon, 23 Mar 2026 16:32:30 +0800
Subject: [PATCH 4/6] Support per-function attribute for DPP LDS threshold and
pass threshold as parameter
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 23 ++++++++++++++-----
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 2aea1e59d26de..556c0c1a99fe4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -118,7 +118,8 @@ class AMDGPUAtomicOptimizerImpl
Value *optimizeAtomicWithDynamicThreshold(IRBuilder<> &B, Instruction &I,
AtomicRMWInst::BinOp Op,
- unsigned ValIdx) const;
+ unsigned ValIdx,
+ unsigned Threshold) const;
public:
AMDGPUAtomicOptimizerImpl() = delete;
@@ -756,12 +757,23 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// Decide which implementation path to use.
// For DPP strategy with integer, divergent-value LDS atomics, when a
// threshold is set, use dynamic branching based on active lane count.
- const unsigned Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
+ // The command-line option takes priority; otherwise fall back to a
+ // per-function attribute.
+ unsigned Threshold = 0;
+ if (AMDGPUAtomicOptimizerDPPLdsThreshold.getNumOccurrences() > 0) {
+ Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
+ } else {
+ int AttrVal = I.getFunction()->getFnAttributeAsParsedInteger(
+ "amdgpu-atomic-optimizer-dpp-lds-threshold", 0);
+ if (AttrVal > 0)
+ Threshold = static_cast<unsigned>(AttrVal);
+ }
+
Value *Result = nullptr;
if (IsLDS && ValDivergent && ScanImpl == ScanOptions::DPP &&
!AtomicRMWInst::isFPOperation(Op) && Threshold > 0 &&
Threshold < ST.getWavefrontSize()) {
- Result = optimizeAtomicWithDynamicThreshold(B, I, Op, ValIdx);
+ Result = optimizeAtomicWithDynamicThreshold(B, I, Op, ValIdx, Threshold);
} else {
Result = optimizeAtomicImpl(B, I, Op, ValIdx, ValDivergent);
}
@@ -1078,8 +1090,8 @@ Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicImpl(IRBuilder<> &B,
// MergeBB:
// PHI merges results from both paths
Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
- IRBuilder<> &B, Instruction &I, AtomicRMWInst::BinOp Op,
- unsigned ValIdx) const {
+ IRBuilder<> &B, Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
+ unsigned Threshold) const {
Type *const Ty = I.getType();
const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
const bool NeedResult = !I.use_empty();
@@ -1095,7 +1107,6 @@ Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), B.getInt32Ty(), false);
// Branch: if active lanes > threshold, use DPP; otherwise no-opt.
- const unsigned Threshold = AMDGPUAtomicOptimizerDPPLdsThreshold;
Value *const ThresholdCond = B.CreateICmpUGT(Ctpop, B.getInt32(Threshold));
BasicBlock *const EntryBB = I.getParent();
>From ae05d5f71927db9da07b1db59669d420bc5d3526 Mon Sep 17 00:00:00 2001
From: YaFan <YaFan.Tao at amd.com>
Date: Mon, 23 Mar 2026 16:43:38 +0800
Subject: [PATCH 5/6] Invert NeedResult check to early return style
---
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 556c0c1a99fe4..5ed7f2cde7ccd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -1178,14 +1178,14 @@ Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
Value *NoOptResult = NeedResult ? static_cast<Value *>(NoOptNewI) : nullptr;
// MergeBB: PHI-merge results from DppBB and NoOptBB.
- if (NeedResult) {
- B.SetInsertPoint(MergeBB, MergeBB->getFirstNonPHIIt());
- PHINode *const MergePHI = B.CreatePHI(Ty, 2);
- MergePHI->addIncoming(DppResult, ThenTerm->getParent());
- MergePHI->addIncoming(NoOptResult, NoOptBB);
- return MergePHI;
- }
- return nullptr;
+ if (!NeedResult)
+ return nullptr;
+
+ B.SetInsertPoint(MergeBB, MergeBB->getFirstNonPHIIt());
+ PHINode *const MergePHI = B.CreatePHI(Ty, 2);
+ MergePHI->addIncoming(DppResult, ThenTerm->getParent());
+ MergePHI->addIncoming(NoOptResult, NoOptBB);
+ return MergePHI;
}
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
>From f64c7dacf70aba4427f96d4e48f76471cdae9581 Mon Sep 17 00:00:00 2001
From: YaFan <YaFan.Tao at amd.com>
Date: Fri, 27 Mar 2026 16:47:52 +0800
Subject: [PATCH 6/6] [AMDGPU] Refactor optimizeAtomicWithDynamicThreshold to
reuse optimizeAtomicImpl
Instead of duplicating the DPP scan/reduction logic, delegate the actual
atomic optimization to optimizeAtomicImpl. Use SplitBlockAndInsertIfThenElse
to build the outer threshold CFG (OptBB/NoOptBB/TailBB), move I into OptBB,
and let optimizeAtomicImpl handle the rest.
NFC.
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 135 ++++++------------
1 file changed, 40 insertions(+), 95 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 5ed7f2cde7ccd..a9caeab069c7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -1060,131 +1060,76 @@ Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicImpl(IRBuilder<> &B,
}
// Generate a dynamic branch based on the active lane count so that the
-// DPP scan path is only used when enough lanes are active to amortise its
-// overhead. When the active count is at or below the threshold, each lane
-// independently issues its own atomic, which is cheaper for small groups.
+// optimized scan path is only used when enough lanes are active to amortise
+// its overhead. When the active count is at or below the threshold, each
+// lane independently issues its own atomic, which is cheaper for small
+// groups.
//
-// CFG produced:
+// This function builds the outer threshold CFG and delegates the actual
+// atomic optimization to optimizeAtomicImpl.
+//
+// CFG produced (the OptimizedPath sub-CFG is built by optimizeAtomicImpl):
//
// EntryBB:
// Ballot / Ctpop -> ActiveCount
-// cmp ActiveCount >= Threshold
-// br -> DppBB or NoOptBB
-//
-// DppBB:
-// DPP scan / reduction, single-lane gate
-// br -> SingleLaneBB or DppExitBB
+// cmp ActiveCount > Threshold
+// br -> OptBB or NoOptBB
//
-// SingleLaneBB:
-// atomic with reduced value
-// br -> DppExitBB
-//
-// DppExitBB:
-// readfirstlane + per-lane result
-// br -> MergeBB
+// OptBB:
+// [optimizeAtomicImpl rewrites I here, creating its own internal CFG]
+// br -> TailBB
//
// NoOptBB:
-// original atomic (unoptimized)
-// br -> MergeBB
+// original atomic (unoptimized, each lane issues its own)
+// br -> TailBB
//
-// MergeBB:
+// TailBB:
// PHI merges results from both paths
Value *AMDGPUAtomicOptimizerImpl::optimizeAtomicWithDynamicThreshold(
IRBuilder<> &B, Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
unsigned Threshold) const {
Type *const Ty = I.getType();
- const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
const bool NeedResult = !I.use_empty();
+ // Count active lanes and build the threshold condition.
Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
CallInst *const Ballot =
B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
-
- Value *Mbcnt = buildMbcnt(B, Ballot);
-
- // Count active lanes.
Value *const Ctpop = B.CreateIntCast(
B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), B.getInt32Ty(), false);
-
- // Branch: if active lanes > threshold, use DPP; otherwise no-opt.
Value *const ThresholdCond = B.CreateICmpUGT(Ctpop, B.getInt32(Threshold));
- BasicBlock *const EntryBB = I.getParent();
- Instruction *const SplitPt = &I;
- Instruction *const ThenTerm = SplitBlockAndInsertIfThen(
- ThresholdCond, SplitPt, false, nullptr, &DTU, nullptr);
- BasicBlock *const DppBB = ThenTerm->getParent();
- BasicBlock *const MergeBB = I.getParent();
-
- // Also create NoOptBB between DppBB and MergeBB.
- BasicBlock *const NoOptBB = BasicBlock::Create(
- I.getContext(), "atomicrmw.no_opt", I.getFunction(), MergeBB);
- // Fix CFG: EntryBB's conditional branch false edge -> NoOptBB -> MergeBB.
- BranchInst *const EntryBr = cast<BranchInst>(EntryBB->getTerminator());
- EntryBr->setSuccessor(1, NoOptBB);
- IRBuilder<> NoOptBuilder(NoOptBB);
- NoOptBuilder.CreateBr(MergeBB);
- DTU.applyUpdates({{DominatorTree::Insert, EntryBB, NoOptBB},
- {DominatorTree::Delete, EntryBB, MergeBB},
- {DominatorTree::Insert, NoOptBB, MergeBB}});
-
- // DppBB: perform DPP scan/reduction and single-lane atomic.
- B.SetInsertPoint(ThenTerm);
-
- AtomicRMWInst::BinOp ScanOp = Op;
- if (Op == AtomicRMWInst::Sub)
- ScanOp = AtomicRMWInst::Add;
-
- Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);
- Value *V = I.getOperand(ValIdx);
-
- auto [NewV, ExclScan] =
- buildDPPScanAndReduce(B, ScanOp, Ty, V, Identity, NeedResult);
+ // Split at I into: EntryBB -> OptBB (then) / NoOptBB (else) -> TailBB.
+ Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
+ SplitBlockAndInsertIfThenElse(ThresholdCond, &I, &ThenTerm, &ElseTerm,
+ nullptr, &DTU, nullptr);
+ BasicBlock *const NoOptBB = ElseTerm->getParent();
+ BasicBlock *const TailBB = I.getParent();
- Value *const DppIsFirst = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
-
- BasicBlock *const DppOrigBB = B.GetInsertBlock();
- Instruction *const DppSingleTerm = SplitBlockAndInsertIfThen(
- DppIsFirst, ThenTerm, false, nullptr, &DTU, nullptr);
- B.SetInsertPoint(DppSingleTerm);
- Instruction *const DppNewI = I.clone();
- B.Insert(DppNewI);
- DppNewI->setOperand(ValIdx, NewV);
-
- Value *DppResult = nullptr;
- if (NeedResult) {
- B.SetInsertPoint(ThenTerm);
- PHINode *const DppPHI = B.CreatePHI(Ty, 2);
- DppPHI->addIncoming(PoisonValue::get(Ty), DppOrigBB);
- DppPHI->addIncoming(DppNewI, DppSingleTerm->getParent());
-
- Value *ReadlaneVal = DppPHI;
- if (TyBitWidth < 32)
- ReadlaneVal = B.CreateZExt(DppPHI, B.getInt32Ty());
- Value *BroadcastI = B.CreateIntrinsic(
- ReadlaneVal->getType(), Intrinsic::amdgcn_readfirstlane, ReadlaneVal);
- if (TyBitWidth < 32)
- BroadcastI = B.CreateTrunc(BroadcastI, Ty);
+ // NoOptBB: each lane independently issues its own atomic (unoptimized).
+ Instruction *const NoOptNewI = I.clone();
+ NoOptNewI->insertBefore(ElseTerm->getIterator());
- Value *LaneOffset =
- B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
- DppResult = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- }
+ // Move I into OptBB so that optimizeAtomicImpl rewrites only the optimized
+ // path. I's uses are temporarily non-dominating; they will be fixed when
+ // optimizeAtomic does replaceAllUsesWith after we return.
+ I.moveBefore(ThenTerm->getIterator());
+ B.SetInsertPoint(&I);
- // NoOptBB: each lane independently issues its own atomic.
- NoOptBuilder.SetInsertPoint(NoOptBB->getTerminator());
- Instruction *const NoOptNewI = I.clone();
- NoOptBuilder.Insert(NoOptNewI);
- Value *NoOptResult = NeedResult ? static_cast<Value *>(NoOptNewI) : nullptr;
+ // Delegate the full scan/reduction + single-lane atomic + result
+ // reconstruction to optimizeAtomicImpl.
+ Value *OptResult =
+ optimizeAtomicImpl(B, I, Op, ValIdx, /*ValDivergent=*/true);
- // MergeBB: PHI-merge results from DppBB and NoOptBB.
if (!NeedResult)
return nullptr;
- B.SetInsertPoint(MergeBB, MergeBB->getFirstNonPHIIt());
+ // After optimizeAtomicImpl, I sits in the exit block of the optimized
+ // sub-CFG. Merge the optimized and no-opt results in TailBB.
+ B.SetInsertPoint(TailBB, TailBB->getFirstNonPHIIt());
PHINode *const MergePHI = B.CreatePHI(Ty, 2);
- MergePHI->addIncoming(DppResult, ThenTerm->getParent());
- MergePHI->addIncoming(NoOptResult, NoOptBB);
+ MergePHI->addIncoming(OptResult, I.getParent());
+ MergePHI->addIncoming(NoOptNewI, NoOptBB);
return MergePHI;
}
More information about the llvm-commits
mailing list