[llvm] r366323 - [AMDGPU] Optimize atomic AND/OR/XOR
Author: foad
Date: Wed Jul 17 06:40:03 2019
New Revision: 366323
URL: http://llvm.org/viewvc/llvm-project?rev=366323&view=rev
Log:
[AMDGPU] Optimize atomic AND/OR/XOR
Summary: Extend the atomic optimizer to handle the AND, OR and XOR operations of atomicrmw instructions and the corresponding buffer atomic intrinsics.
Reviewers: arsenm, sheredom
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64809
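The optimization rests on a scalar identity: when n active lanes in a wavefront all apply the same uniform value v to the same address, one atomic with a suitably combined value has the same effect on memory as the n individual atomics. Add/sub already used v * n; this patch records that and/or with a uniform value are idempotent (like max/min, the combined value is just v) and that xor depends only on the parity of n. A standalone C++ model of these combines (illustrative names only, not code from the patch):

// Model of the wavefront-level combines, assuming n >= 1 active lanes
// all applying the same value v:
//   add:     combined = v * n           (pre-existing case)
//   and/or:  combined = v               (idempotent, like max/min)
//   xor:     combined = v * (n & 1)     (only the parity of n matters)
#include <cassert>
#include <cstdint>

static uint32_t combinedValue(char op, uint32_t v, uint32_t activeLanes) {
  switch (op) {
  case '+': return v * activeLanes;        // add
  case '&': case '|': return v;            // and/or are idempotent
  case '^': return v * (activeLanes & 1);  // xor depends on parity only
  }
  return 0;
}

int main() {
  const uint32_t v = 0b1010, init = 0b0110, n = 5;
  // Serial reference: apply the op once per lane, as n lanes would.
  uint32_t memXor = init, memOr = init;
  for (uint32_t i = 0; i != n; ++i) {
    memXor ^= v;
    memOr |= v;
  }
  // One atomic with the combined value reaches the same final state.
  assert(memXor == (init ^ combinedValue('^', v, n)));
  assert(memOr == (init | combinedValue('|', v, n)));
}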
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp?rev=366323&r1=366322&r2=366323&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp Wed Jul 17 06:40:03 2019
@@ -127,6 +127,9 @@ void AMDGPUAtomicOptimizer::visitAtomicR
return;
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
@@ -177,6 +180,21 @@ void AMDGPUAtomicOptimizer::visitIntrins
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
Op = AtomicRMWInst::Sub;
break;
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ Op = AtomicRMWInst::And;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ Op = AtomicRMWInst::Or;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ Op = AtomicRMWInst::Xor;
+ break;
case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
@@ -240,6 +258,12 @@ static Value *buildNonAtomicBinOp(IRBuil
return B.CreateBinOp(Instruction::Add, LHS, RHS);
case AtomicRMWInst::Sub:
return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+ case AtomicRMWInst::And:
+ return B.CreateBinOp(Instruction::And, LHS, RHS);
+ case AtomicRMWInst::Or:
+ return B.CreateBinOp(Instruction::Or, LHS, RHS);
+ case AtomicRMWInst::Xor:
+ return B.CreateBinOp(Instruction::Xor, LHS, RHS);
case AtomicRMWInst::Max:
Pred = CmpInst::ICMP_SGT;
@@ -265,8 +289,11 @@ static APInt getIdentityValueForAtomicOp
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
case AtomicRMWInst::UMax:
return APInt::getMinValue(BitWidth);
+ case AtomicRMWInst::And:
case AtomicRMWInst::UMin:
return APInt::getMaxValue(BitWidth);
case AtomicRMWInst::Max:
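The hunk above groups OR and XOR with add/sub/umax, whose identity is 0 (APInt::getMinValue), while AND joins umin, whose identity is the all-ones pattern (APInt::getMaxValue). As a sanity check on those choices (a standalone snippet, not part of the patch), x op identity must equal x for every x:

// Identity elements used by the select and scan paths (illustrative only).
static_assert((0xDEADBEEFu | 0u) == 0xDEADBEEFu, "identity of OR is 0");
static_assert((0xDEADBEEFu ^ 0u) == 0xDEADBEEFu, "identity of XOR is 0");
static_assert((0xDEADBEEFu & ~0u) == 0xDEADBEEFu, "identity of AND is all ones");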
@@ -331,10 +358,10 @@ void AMDGPUAtomicOptimizer::optimizeAtom
Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
CallInst *const PartialMbcnt = B.CreateIntrinsic(
Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
- CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
- {ExtractHi, PartialMbcnt});
-
- Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+ Value *const Mbcnt =
+ B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+ {ExtractHi, PartialMbcnt}),
+ Ty, false);
Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
@@ -408,32 +435,39 @@ void AMDGPUAtomicOptimizer::optimizeAtom
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub: {
- // Get the total number of active lanes we have by using popcount.
- Instruction *const Ctpop =
- B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
- Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
- // Calculate the new value we will be contributing to the atomic operation
- // for the entire wavefront.
- NewV = B.CreateMul(V, CtpopCast);
+ // The new value we will be contributing to the atomic operation is the
+ // old value times the number of active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ NewV = B.CreateMul(V, Ctpop);
break;
}
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
- // Max/min with a uniform value is idempotent: doing the atomic operation
- // multiple times has the same effect as doing it once.
+ // These operations with a uniform value are idempotent: doing the atomic
+ // operation multiple times has the same effect as doing it once.
NewV = V;
break;
+
+ case AtomicRMWInst::Xor: {
+ // The new value we will be contributing to the atomic operation is the
+ // old value times the parity of the number of active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+ break;
+ }
}
}
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
- Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+ Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
// Store I's original basic block before we split the block.
BasicBlock *const EntryBB = I.getParent();
@@ -502,14 +536,19 @@ void AMDGPUAtomicOptimizer::optimizeAtom
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
- LaneOffset = B.CreateMul(V, MbcntCast);
+ LaneOffset = B.CreateMul(V, Mbcnt);
break;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
LaneOffset = B.CreateSelect(Cond, Identity, V);
break;
+ case AtomicRMWInst::Xor:
+ LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+ break;
}
}
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
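After the single atomic's return value is broadcast to every lane, each lane still has to recover what its own atomic would have returned: the memory value after the mbcnt lanes below it had already applied v. For and/or, only the lowest active lane sees the unmodified old value, hence the select against the identity; every other lane's result is the broadcast combined with v once. For xor, a lane's offset is v times the parity of mbcnt, mirroring the wavefront-level combine. A standalone model of the xor reconstruction (illustrative names only, not code from the patch):

// Lane i's original atomic would have returned the memory value after the
// i lanes below it had already applied v, so for xor:
//   old_i = broadcast ^ (v * (i & 1))
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t v = 0b1010, init = 0b0110, lanes = 8;
  // Serial reference: each lane does "old = mem; mem ^= v" in lane order.
  uint32_t mem = init;
  for (uint32_t i = 0; i != lanes; ++i) {
    uint32_t old = mem;
    mem ^= v;
    // Reconstruction used by the pass: the broadcast value (here init, what
    // the one real atomic returned) xor'd with v times the parity of mbcnt
    // (here i, the number of active lanes below this one).
    uint32_t laneOffset = v * (i & 1);
    assert(old == (init ^ laneOffset));
  }
}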
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll?rev=366323&r1=366322&r2=366323&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll Wed Jul 17 06:40:03 2019
@@ -195,6 +195,42 @@ entry:
ret void
}
+; GCN-LABEL: and_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: or_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: xor_i32_varying:
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
+define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
+entry:
+ %lane = call i32 @llvm.amdgcn.workitem.id.x()
+ %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+ store i32 %old, i32 addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: max_i32_varying:
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]