[llvm] 9d08f27 - [AMDGPU] Use reductions instead of scans in the atomic optimizer
Jay Foad via llvm-commits
llvm-commits@lists.llvm.org
Fri Mar 26 08:41:47 PDT 2021
Author: Jay Foad
Date: 2021-03-26T15:38:14Z
New Revision: 9d08f276d79b59e3d1ad3db3db19077284524ca3
URL: https://github.com/llvm/llvm-project/commit/9d08f276d79b59e3d1ad3db3db19077284524ca3
DIFF: https://github.com/llvm/llvm-project/commit/9d08f276d79b59e3d1ad3db3db19077284524ca3.diff
LOG: [AMDGPU] Use reductions instead of scans in the atomic optimizer
If the result of an atomic operation is not used then it can be more
efficient to build a reduction across all lanes instead of a scan. Do
this for GFX10, where the permlanex16 instruction makes it viable. For
wave64 this saves a couple of dpp operations. For wave32 it saves one
readlane (readlanes are generally bad for performance) and one dpp
operation.
Differential Revision: https://reviews.llvm.org/D98953
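As a rough illustration (not part of the commit): the reduction path is effectively a log2-step butterfly. The row_xmask:1/2/4/8 DPP adds combine lanes within each row of 16, permlanex16 then folds in the other row of each 32-lane group, and for wave64 a final pair of readlanes is combined with one scalar add. A minimal scalar model of that pattern for an integer add follows; the 64-lane array, the uint32_t element type and the reduceWave64 name are illustrative only, not LLVM API.

  #include <array>
  #include <cstdint>

  // Model a wave64 register as one value per lane. After the step with a
  // given Offset, lane i holds the sum over all lanes that differ from i
  // only in bits at or below that offset, so after log2(64) = 6 steps
  // every lane holds the whole-wave total.
  uint32_t reduceWave64(std::array<uint32_t, 64> V) {
    for (unsigned Offset = 1; Offset < 64; Offset <<= 1) {
      std::array<uint32_t, 64> Next = V;
      for (unsigned Lane = 0; Lane < 64; ++Lane)
        Next[Lane] = V[Lane] + V[Lane ^ Offset]; // combine with butterfly partner
      V = Next;
    }
    return V[0]; // all lanes agree; any one lane's value feeds the single atomic
  }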
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIDefines.h
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 147c88d82cf8a..3e9fdcb1618e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@ class AMDGPUAtomicOptimizer : public FunctionPass,
const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+ // Reduce within each row of 16 lanes.
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+
+ // Reduce within each pair of rows (i.e. 32 lanes).
+ assert(ST->hasPermLaneX16());
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+ if (ST->isWave32())
+ return V;
+
+ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+ // combine them with a scalar operation.
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+ Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
+
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -313,6 +354,7 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
+ assert(ST->hasPermLaneX16());
Value *const PermX = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
@@ -489,16 +531,24 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
const AtomicRMWInst::BinOp ScanOp =
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
- NewV = buildScan(B, ScanOp, NewV, Identity);
- if (NeedResult)
- ExclScan = buildShiftRight(B, NewV, Identity);
-
- // Read the value from the last lane, which has accumlated the values of
- // each active lane in the wavefront. This will be our new value which we
- // will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- assert(TyBitWidth == 32);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx});
+ if (!NeedResult && ST->hasPermLaneX16()) {
+ // On GFX10 the permlanex16 instruction helps us build a reduction without
+ // too many readlanes and writelanes, which are generally bad for
+ // performance.
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
+
+ // Read the value from the last lane, which has accumlated the values of
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ assert(TyBitWidth == 32);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {NewV, LastLaneIdx});
+ }
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 5421e96642fc9..415b1cb4854b3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -811,6 +811,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return GFX8Insts;
}
+ /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+ bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+
bool hasDPP() const {
return HasDPP;
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 9ef87fec0ca11..b6abfdf53efc3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -671,6 +671,7 @@ enum SDWA9EncValues : unsigned {
namespace DPP {
+// clang-format off
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
QUAD_PERM_ID = 0xE4, // identity permutation
@@ -707,12 +708,15 @@ enum DppCtrl : unsigned {
DPP_UNUSED8_LAST = 0x14F,
ROW_NEWBCAST_FIRST= 0x150,
ROW_NEWBCAST_LAST = 0x15F,
+ ROW_SHARE0 = 0x150,
ROW_SHARE_FIRST = 0x150,
ROW_SHARE_LAST = 0x15F,
+ ROW_XMASK0 = 0x160,
ROW_XMASK_FIRST = 0x160,
ROW_XMASK_LAST = 0x16F,
DPP_LAST = ROW_XMASK_LAST
};
+// clang-format on
enum DppFiMode {
DPP_FI_0 = 0,
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 5590c4ee47bdd..a3166e46501cf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -663,23 +663,21 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 63
+; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
-; GFX1064-NEXT: s_mov_b32 s0, s2
+; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB3_2
@@ -701,26 +699,24 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_readlane_b32 s1, v1, 31
+; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s0, s1
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB3_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v3, s0
+; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_add_u32 v0, v3
+; GFX1032-NEXT: ds_add_u32 v3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB3_2:
@@ -1847,23 +1843,21 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 63
+; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
-; GFX1064-NEXT: s_mov_b32 s0, s2
+; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB10_2
@@ -1885,26 +1879,24 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_readlane_b32 s1, v1, 31
+; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s0, s1
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB10_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v3, s0
+; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_sub_u32 v0, v3
+; GFX1032-NEXT: ds_sub_u32 v3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB10_2: