[llvm] r366235 - [AMDGPU] Optimize atomic max/min
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 16 10:44:54 PDT 2019
Author: foad
Date: Tue Jul 16 10:44:54 2019
New Revision: 366235
URL: http://llvm.org/viewvc/llvm-project?rev=366235&view=rev
Log:
[AMDGPU] Optimize atomic max/min
Summary:
Extend the atomic optimizer to handle signed and unsigned max and min
operations, as well as add and subtract.
Reviewers: arsenm, sheredom, critson, rampitec
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64328
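
For context, here is a minimal standalone C++ sketch. It is not part of the patch and does not use the LLVM APIs; it only illustrates, under the stated assumptions, the property the optimizer relies on for max: reducing the divergent per-lane values once (with inactive lanes contributing the signed-min identity, as getIdentityValueForAtomicOp returns for AtomicRMWInst::Max) and then issuing a single atomic max from one lane gives the same final memory value as every lane performing its own atomic max.

// Standalone sketch (not part of the patch): shows that one max over the
// wavefront-reduced value is equivalent to per-lane atomic maxes, which is
// the property the optimizer exploits.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Identity for a signed 32-bit max reduction, mirroring
// getIdentityValueForAtomicOp(AtomicRMWInst::Max, 32) in the patch.
constexpr int32_t MaxIdentity = INT32_MIN;

int main() {
  int32_t Memory = 7;                          // shared (e.g. LDS) location
  std::vector<int32_t> Lanes = {3, 42, -5, 9}; // divergent per-lane values

  // Unoptimized behaviour: each active lane performs its own atomic max.
  int32_t Naive = Memory;
  for (int32_t V : Lanes)
    Naive = std::max(Naive, V);

  // Optimized behaviour: reduce across the wavefront first (inactive lanes
  // contribute the identity), then issue a single max against memory.
  int32_t Reduced = MaxIdentity;
  for (int32_t V : Lanes)
    Reduced = std::max(Reduced, V);
  int32_t Optimized = std::max(Memory, Reduced);

  assert(Naive == Optimized);
  return 0;
}

The same reasoning carries over to min, umax, and umin with the corresponding identity values, and for the uniform-value case the patch skips the reduction entirely because max/min with a uniform operand is idempotent.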
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp?rev=366235&r1=366234&r2=366235&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp Tue Jul 16 10:44:54 2019
@@ -40,7 +40,7 @@ enum DPP_CTRL {
struct ReplacementInfo {
Instruction *I;
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op;
unsigned ValIdx;
bool ValDivergent;
};
@@ -55,8 +55,8 @@ private:
bool HasDPP;
bool IsPixelShader;
- void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
- unsigned ValIdx, bool ValDivergent) const;
+ void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
+ bool ValDivergent) const;
public:
static char ID;
@@ -120,16 +120,17 @@ void AMDGPUAtomicOptimizer::visitAtomicR
break;
}
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op = I.getOperation();
- switch (I.getOperation()) {
+ switch (Op) {
default:
return;
case AtomicRMWInst::Add:
- Op = Instruction::Add;
- break;
case AtomicRMWInst::Sub:
- Op = Instruction::Sub;
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
break;
}
@@ -161,7 +162,7 @@ void AMDGPUAtomicOptimizer::visitAtomicR
}
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op;
switch (I.getIntrinsicID()) {
default:
@@ -169,12 +170,32 @@ void AMDGPUAtomicOptimizer::visitIntrins
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
- Op = Instruction::Add;
+ Op = AtomicRMWInst::Add;
break;
case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
- Op = Instruction::Sub;
+ Op = AtomicRMWInst::Sub;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ Op = AtomicRMWInst::Min;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ Op = AtomicRMWInst::UMin;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ Op = AtomicRMWInst::Max;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ Op = AtomicRMWInst::UMax;
break;
}
@@ -206,8 +227,57 @@ void AMDGPUAtomicOptimizer::visitIntrins
ToReplace.push_back(Info);
}
+// Use the builder to create the non-atomic counterpart of the specified
+// atomicrmw binary op.
+static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *LHS, Value *RHS) {
+ CmpInst::Predicate Pred;
+
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ return B.CreateBinOp(Instruction::Add, LHS, RHS);
+ case AtomicRMWInst::Sub:
+ return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+
+ case AtomicRMWInst::Max:
+ Pred = CmpInst::ICMP_SGT;
+ break;
+ case AtomicRMWInst::Min:
+ Pred = CmpInst::ICMP_SLT;
+ break;
+ case AtomicRMWInst::UMax:
+ Pred = CmpInst::ICMP_UGT;
+ break;
+ case AtomicRMWInst::UMin:
+ Pred = CmpInst::ICMP_ULT;
+ break;
+ }
+ Value *Cond = B.CreateICmp(Pred, LHS, RHS);
+ return B.CreateSelect(Cond, LHS, RHS);
+}
+
+static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
+ unsigned BitWidth) {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::UMax:
+ return APInt::getMinValue(BitWidth);
+ case AtomicRMWInst::UMin:
+ return APInt::getMaxValue(BitWidth);
+ case AtomicRMWInst::Max:
+ return APInt::getSignedMinValue(BitWidth);
+ case AtomicRMWInst::Min:
+ return APInt::getSignedMaxValue(BitWidth);
+ }
+}
+
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
- Instruction::BinaryOps Op,
+ AtomicRMWInst::BinOp Op,
unsigned ValIdx,
bool ValDivergent) const {
// Start building just before the instruction.
@@ -266,16 +336,16 @@ void AMDGPUAtomicOptimizer::optimizeAtom
Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
- Value *LaneOffset = nullptr;
+ Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+
+ Value *ExclScan = nullptr;
Value *NewV = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
- Value *const Identity = B.getIntN(TyBitWidth, 0);
-
- // First we need to set all inactive invocations to 0, so that they can
- // correctly contribute to the final result.
+ // First we need to set all inactive invocations to the identity value, so
+ // that they can correctly contribute to the final result.
CallInst *const SetInactive =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
@@ -283,7 +353,7 @@ void AMDGPUAtomicOptimizer::optimizeAtom
B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
{Identity, SetInactive, B.getInt32(DPP_WF_SR1),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
- NewV = FirstDPP;
+ ExclScan = FirstDPP;
const unsigned Iters = 7;
const unsigned DPPCtrl[Iters] = {
@@ -295,21 +365,20 @@ void AMDGPUAtomicOptimizer::optimizeAtom
// This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {
- Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
+ Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
CallInst *const DPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Ty,
{Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
- NewV = B.CreateBinOp(Op, NewV, DPP);
+ ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
- NewV = B.CreateBinOp(Op, SetInactive, NewV);
+ NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
// Read the value from the last lane, which has accumlated the values of
- // each active lane in the wavefront. This will be our new value with which
- // we will provide to the atomic operation.
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
@@ -324,9 +393,8 @@ void AMDGPUAtomicOptimizer::optimizeAtom
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
- CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
- {}, {NewV, B.getInt32(63)});
- NewV = ReadLane;
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {NewV, B.getInt32(63)});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
@@ -334,14 +402,32 @@ void AMDGPUAtomicOptimizer::optimizeAtom
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {
- // Get the total number of active lanes we have by using popcount.
- Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
- Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
- // Calculate the new value we will be contributing to the atomic operation
- // for the entire wavefront.
- NewV = B.CreateMul(V, CtpopCast);
- LaneOffset = B.CreateMul(V, MbcntCast);
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub: {
+ // Get the total number of active lanes we have by using popcount.
+ Instruction *const Ctpop =
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
+ Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
+
+ // Calculate the new value we will be contributing to the atomic operation
+ // for the entire wavefront.
+ NewV = B.CreateMul(V, CtpopCast);
+ break;
+ }
+
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // Max/min with a uniform value is idempotent: doing the atomic operation
+ // multiple times has the same effect as doing it once.
+ NewV = V;
+ break;
+ }
}
// We only want a single lane to enter our new control flow, and we do this
@@ -407,7 +493,26 @@ void AMDGPUAtomicOptimizer::optimizeAtom
// get our individual lane's slice into the result. We use the lane offset we
// previously calculated combined with the atomic result value we got from the
// first lane, to get our lane's index into the atomic result.
- Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
+ Value *LaneOffset = nullptr;
+ if (ValDivergent) {
+ LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ } else {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ LaneOffset = B.CreateMul(V, MbcntCast);
+ break;
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ LaneOffset = B.CreateSelect(Cond, Identity, V);
+ break;
+ }
+ }
+ Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll?rev=366235&r1=366234&r2=366235&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll Tue Jul 16 10:44:54 2019
@@ -194,3 +194,111 @@ entry:
store i64 %old, i64 addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: max_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: max_i64_constant:
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
+; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
+entry:
+ %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
+ store i64 %old, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: min_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: min_i64_constant:
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
+; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
+entry:
+ %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
+ store i64 %old, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: umax_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: umax_i64_constant:
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
+; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
+entry:
+ %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
+ store i64 %old, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: umin_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: umin_i64_constant:
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
+; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
+entry:
+ %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
+ store i64 %old, i64 addrspace(1)* %out
+ ret void
+}