[PATCH] D64207: [AMDGPU] DPP combiner: recognize identities for more opcodes

Thu Jul 4 07:05:52 PDT 2019

foad created this revision.
foad added reviewers: arsenm, vpykhtin.
Herald added subscribers: jfb, MaskRay, kbarton, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, wdng, jvesely, nemanjai, kzhuravl.
Herald added a project: LLVM.

This allows the DPP combiner to kick in more often, by also recognizing
identity values for the _e64 variants of the opcodes it already handles.
For example, the exclusive scan generated by the atomic optimizer for a
divergent atomic add used to look like this:

  v_mov_b32_e32 v3, v1
  v_mov_b32_e32 v5, v1
  v_mov_b32_e32 v6, v1
  v_mov_b32_dpp v3, v2  wave_shr:1 row_mask:0xf bank_mask:0xf
  s_nop 1
  v_add_u32_dpp v4, v3, v3  row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
  v_mov_b32_dpp v5, v3  row_shr:2 row_mask:0xf bank_mask:0xf
  v_mov_b32_dpp v6, v3  row_shr:3 row_mask:0xf bank_mask:0xf
  v_add3_u32 v3, v4, v5, v6
  v_mov_b32_e32 v4, v1
  s_nop 1
  v_mov_b32_dpp v4, v3  row_shr:4 row_mask:0xf bank_mask:0xe
  v_add_u32_e32 v3, v3, v4
  v_mov_b32_e32 v4, v1
  s_nop 1
  v_mov_b32_dpp v4, v3  row_shr:8 row_mask:0xf bank_mask:0xc
  v_add_u32_e32 v3, v3, v4
  v_mov_b32_e32 v4, v1
  s_nop 1
  v_mov_b32_dpp v4, v3  row_bcast:15 row_mask:0xa bank_mask:0xf
  v_add_u32_e32 v3, v3, v4
  s_nop 1
  v_mov_b32_dpp v1, v3  row_bcast:31 row_mask:0xc bank_mask:0xf
  v_add_u32_e32 v1, v3, v1
  v_add_u32_e32 v1, v2, v1
  v_readlane_b32 s0, v1, 63

But now most of the dpp movs are combined into adds:

  v_mov_b32_e32 v3, v1
  v_mov_b32_e32 v5, v1
  s_nop 0
  v_mov_b32_dpp v3, v2  wave_shr:1 row_mask:0xf bank_mask:0xf
  s_nop 1
  v_add_u32_dpp v4, v3, v3  row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
  v_mov_b32_dpp v5, v3  row_shr:2 row_mask:0xf bank_mask:0xf
  v_mov_b32_dpp v1, v3  row_shr:3 row_mask:0xf bank_mask:0xf
  v_add3_u32 v1, v4, v5, v1
  s_nop 1
  v_add_u32_dpp v1, v1, v1  row_shr:4 row_mask:0xf bank_mask:0xe
  s_nop 1
  v_add_u32_dpp v1, v1, v1  row_shr:8 row_mask:0xf bank_mask:0xc
  s_nop 1
  v_add_u32_dpp v1, v1, v1  row_bcast:15 row_mask:0xa bank_mask:0xf
  s_nop 1
  v_add_u32_dpp v1, v1, v1  row_bcast:31 row_mask:0xc bank_mask:0xf
  v_add_u32_e32 v1, v2, v1
  v_readlane_b32 s0, v1, 63

Also fix some typos in comments and debug output.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D64207

Files:
  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp


Index: llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
===================================================================

--- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -32,9 +32,9 @@
 // -> $combined_old = src1,
 //    $combined_bound_ctrl = DPP_BOUND_OFF
 //
-// Othervise cancel.
+// Otherwise cancel.
 //
-// The mov_dpp instruction should recide in the same BB as all it's uses
+// The mov_dpp instruction should reside in the same BB as all its uses
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -253,33 +253,46 @@
   switch (OrigMIOp) {
   default: break;
   case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
   case AMDGPU::V_ADD_I32_e32:
+  case AMDGPU::V_ADD_I32_e64:
   case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64:
   case AMDGPU::V_SUBREV_U32_e32:
+  case AMDGPU::V_SUBREV_U32_e64:
   case AMDGPU::V_SUBREV_I32_e32:
+  case AMDGPU::V_SUBREV_I32_e64:
   case AMDGPU::V_MAX_U32_e32:
+  case AMDGPU::V_MAX_U32_e64:
   case AMDGPU::V_XOR_B32_e32:
+  case AMDGPU::V_XOR_B32_e64:
     if (OldOpnd->getImm() == 0)
       return true;
     break;
   case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
   case AMDGPU::V_MIN_U32_e32:
+  case AMDGPU::V_MIN_U32_e64:
     if (static_cast<uint32_t>(OldOpnd->getImm()) ==
         std::numeric_limits<uint32_t>::max())
       return true;
     break;
   case AMDGPU::V_MIN_I32_e32:
+  case AMDGPU::V_MIN_I32_e64:
     if (static_cast<int32_t>(OldOpnd->getImm()) ==
         std::numeric_limits<int32_t>::max())
       return true;
     break;
   case AMDGPU::V_MAX_I32_e32:
+  case AMDGPU::V_MAX_I32_e64:
     if (static_cast<int32_t>(OldOpnd->getImm()) ==
         std::numeric_limits<int32_t>::min())
       return true;
     break;
   case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_I32_I24_e64:
   case AMDGPU::V_MUL_U32_U24_e32:
+  case AMDGPU::V_MUL_U32_U24_e64:
     if (OldOpnd->getImm() == 1)
       return true;
     break;
@@ -300,7 +313,7 @@
       return nullptr;
     }
     if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
-      LLVM_DEBUG(dbgs() << "  failed: old immediate ins't an identity\n");
+      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
       return nullptr;
     }
     CombOldVGPR = getRegSubRegPair(*Src1);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D64207.208037.patch
Type: text/x-patch
Size: 2358 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190704/dbb770c3/attachment.bin>