[llvm] 9d08f27 - [AMDGPU] Use reductions instead of scans in the atomic optimizer
Jay Foad via llvm-commits
llvm-commits@lists.llvm.org
Fri Mar 26 08:41:47 PDT 2021
Author: Jay Foad
Date: 2021-03-26T15:38:14Z
New Revision: 9d08f276d79b59e3d1ad3db3db19077284524ca3
URL: https://github.com/llvm/llvm-project/commit/9d08f276d79b59e3d1ad3db3db19077284524ca3
DIFF: https://github.com/llvm/llvm-project/commit/9d08f276d79b59e3d1ad3db3db19077284524ca3.diff
LOG: [AMDGPU] Use reductions instead of scans in the atomic optimizer
If the result of an atomic operation is not used then it can be more
efficient to build a reduction across all lanes instead of a scan. Do
this for GFX10, where the permlanex16 instruction makes it viable. For
wave64 this saves a couple of dpp operations. For wave32 it saves one
readlane (readlanes are generally bad for performance) and one dpp
operation.
Differential Revision: https://reviews.llvm.org/D98953
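As a rough illustration (not part of the commit): the reduction path is effectively a log2-step butterfly. The row_xmask:1/2/4/8 DPP adds combine lanes within each row of 16, permlanex16 then folds in the other row of each 32-lane group, and for wave64 a final pair of readlanes is combined with one scalar add. A minimal scalar model of that pattern for an integer add follows; the 64-lane array, the uint32_t element type and the reduceWave64 name are illustrative only, not LLVM API.

  #include <array>
  #include <cstdint>

  // Model a wave64 register as one value per lane. After the step with a
  // given Offset, lane i holds the sum over all lanes that differ from i
  // only in bits at or below that offset, so after log2(64) = 6 steps
  // every lane holds the whole-wave total.
  uint32_t reduceWave64(std::array<uint32_t, 64> V) {
    for (unsigned Offset = 1; Offset < 64; Offset <<= 1) {
      std::array<uint32_t, 64> Next = V;
      for (unsigned Lane = 0; Lane < 64; ++Lane)
        Next[Lane] = V[Lane] + V[Lane ^ Offset]; // combine with butterfly partner
      V = Next;
    }
    return V[0]; // all lanes agree; any one lane's value feeds the single atomic
  }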
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIDefines.h
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 147c88d82cf8a..3e9fdcb1618e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@ class AMDGPUAtomicOptimizer : public FunctionPass,
const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+ // Reduce within each row of 16 lanes.
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+
+ // Reduce within each pair of rows (i.e. 32 lanes).
+ assert(ST->hasPermLaneX16());
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+ if (ST->isWave32())
+ return V;
+
+ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+ // combine them with a scalar operation.
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+ Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
+
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -313,6 +354,7 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
+ assert(ST->hasPermLaneX16());
Value *const PermX = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
@@ -489,16 +531,24 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
const AtomicRMWInst::BinOp ScanOp =
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
- NewV = buildScan(B, ScanOp, NewV, Identity);
- if (NeedResult)
- ExclScan = buildShiftRight(B, NewV, Identity);
-
- // Read the value from the last lane, which has accumlated the values of
- // each active lane in the wavefront. This will be our new value which we
- // will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- assert(TyBitWidth == 32);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx});
+ if (!NeedResult && ST->hasPermLaneX16()) {
+ // On GFX10 the permlanex16 instruction helps us build a reduction without
+ // too many readlanes and writelanes, which are generally bad for
+ // performance.
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
+
+ // Read the value from the last lane, which has accumlated the values of
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ assert(TyBitWidth == 32);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {NewV, LastLaneIdx});
+ }
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 5421e96642fc9..415b1cb4854b3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -811,6 +811,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return GFX8Insts;
}
+ /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+ bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+
bool hasDPP() const {
return HasDPP;
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 9ef87fec0ca11..b6abfdf53efc3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -671,6 +671,7 @@ enum SDWA9EncValues : unsigned {
namespace DPP {
+// clang-format off
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
QUAD_PERM_ID = 0xE4, // identity permutation
@@ -707,12 +708,15 @@ enum DppCtrl : unsigned {
DPP_UNUSED8_LAST = 0x14F,
ROW_NEWBCAST_FIRST= 0x150,
ROW_NEWBCAST_LAST = 0x15F,
+ ROW_SHARE0 = 0x150,
ROW_SHARE_FIRST = 0x150,
ROW_SHARE_LAST = 0x15F,
+ ROW_XMASK0 = 0x160,
ROW_XMASK_FIRST = 0x160,
ROW_XMASK_LAST = 0x16F,
DPP_LAST = ROW_XMASK_LAST
};
+// clang-format on
enum DppFiMode {
DPP_FI_0 = 0,
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 5590c4ee47bdd..a3166e46501cf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -663,23 +663,21 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 63
+; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
-; GFX1064-NEXT: s_mov_b32 s0, s2
+; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB3_2
@@ -701,26 +699,24 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_readlane_b32 s1, v1, 31
+; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s0, s1
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB3_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v3, s0
+; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_add_u32 v0, v3
+; GFX1032-NEXT: ds_add_u32 v3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB3_2:
@@ -1847,23 +1843,21 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT: v_readlane_b32 s2, v1, 63
+; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
+; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
-; GFX1064-NEXT: s_mov_b32 s0, s2
+; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB10_2
@@ -1885,26 +1879,24 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_readlane_b32 s1, v1, 31
+; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s0, s1
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB10_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v3, s0
+; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_sub_u32 v0, v3
+; GFX1032-NEXT: ds_sub_u32 v3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB10_2: