[llvm] [AMDGPU] Exclude certain opcodes from being marked as single use (PR #91802)

via llvm-commits llvm-commits at lists.llvm.org
Fri May 10 13:26:34 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Scott Egerton (ScottEgerton)

Changes:

The s_singleuse_vdst instruction marks regions of instructions that produce
values with only one use.
Certain instructions take more than one cycle to execute, which can cause such
regions to be marked incorrectly.
This patch excludes these multi-cycle instructions from being marked as
producing single-use values, as consuming single-use values, or as both,
depending on the instruction.
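
For context, a minimal sketch of the marking, adapted from the MIR tests in
this patch: the pass inserts an S_SINGLEUSE_VDST immediately before a VALU
producer whose result is read exactly once, and the opcodes excluded here no
longer begin (or no longer terminate) such a region.

```
; Before the amdgpu-insert-single-use-vdst pass: $vgpr1 is written once
; and read exactly once, so its producer qualifies as single-use.
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
$vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec

; After the pass: the single-use producer is marked.
S_SINGLEUSE_VDST 1
$vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
$vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
```

With this patch, swapping the consumer for one of the excluded multi-cycle
opcodes (for example V_MUL_HI_U32_e64) suppresses the marker, as the new
tests below check.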


---
Full diff: https://github.com/llvm/llvm-project/pull/91802.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp (+150-2) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir (+156-1) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index b78952ca3a622..0571b49ba7700 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -132,6 +132,153 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 
   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
 
+  static bool IsValidOpcode(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELSD_B32_e32:
+    case AMDGPU::V_MOVRELSD_B32_e64:
+    case AMDGPU::V_SWAPREL_B32:
+    case AMDGPU::V_PERMLANE64_B32:
+    case AMDGPU::V_PERMLANE16_B32_e64:
+    case AMDGPU::V_PERMLANE16_B32_gfx10:
+    case AMDGPU::V_PERMLANEX16_B32_e64:
+    case AMDGPU::V_PERMLANEX16_B32_gfx10:
+    case AMDGPU::V_WRITELANE_B32:
+      return false;
+    default:
+      if (SIInstrInfo::isDPP(MI)) {
+        switch (MI.getOpcode()) {
+        case AMDGPU::V_INTERP_MOV_F32:
+        case AMDGPU::V_INTERP_P1_F32_16bank:
+        case AMDGPU::V_INTERP_P1_F32:
+        case AMDGPU::V_INTERP_P2_F32:
+        case AMDGPU::V_INTERP_MOV_F32_e64:
+        case AMDGPU::V_INTERP_P10_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P10_F32_inreg:
+        case AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P1_F32_e64_vi:
+        case AMDGPU::V_INTERP_P1LL_F16_vi:
+        case AMDGPU::V_INTERP_P1LV_F16_vi:
+        case AMDGPU::V_INTERP_P2_F16_vi:
+        case AMDGPU::V_INTERP_P2_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P2_F32_inreg:
+        case AMDGPU::V_INTERP_P2_F32_e64:
+        case AMDGPU::V_INTERP_P2_LEGACY_F16_gfx9:
+        case AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg:
+          return true;
+        default:
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  static bool IsValidConsumerOpcode(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELS_B32_e32:
+    case AMDGPU::V_MOVRELS_B32_e64:
+    case AMDGPU::V_READFIRSTLANE_B32:
+    case AMDGPU::V_READLANE_B32:
+    case AMDGPU::V_MAD_I64_I32_vi:
+    case AMDGPU::V_MAD_U64_U32_vi:
+    case AMDGPU::V_ASHRREV_I64_vi:
+    case AMDGPU::V_LSHLREV_B64_vi:
+    case AMDGPU::V_LSHRREV_B64_vi:
+    case AMDGPU::V_MQSAD_PK_U16_U8_vi:
+    case AMDGPU::V_MQSAD_U32_U8_vi:
+    case AMDGPU::V_QSAD_PK_U16_U8_vi:
+    case AMDGPU::V_CMPX_EQ_I64_e32:
+    case AMDGPU::V_CMPX_EQ_I64_e64:
+    case AMDGPU::V_CMPX_EQ_U64_e32:
+    case AMDGPU::V_CMPX_EQ_U64_e64:
+    case AMDGPU::V_CMPX_F_I64_e32:
+    case AMDGPU::V_CMPX_F_I64_e64:
+    case AMDGPU::V_CMPX_F_U64_e32:
+    case AMDGPU::V_CMPX_F_U64_e64:
+    case AMDGPU::V_CMPX_GE_I64_e32:
+    case AMDGPU::V_CMPX_GE_I64_e64:
+    case AMDGPU::V_CMPX_GE_U64_e32:
+    case AMDGPU::V_CMPX_GE_U64_e64:
+    case AMDGPU::V_CMPX_GT_I64_e32:
+    case AMDGPU::V_CMPX_GT_I64_e64:
+    case AMDGPU::V_CMPX_GT_U64_e32:
+    case AMDGPU::V_CMPX_GT_U64_e64:
+    case AMDGPU::V_CMPX_LE_I64_e32:
+    case AMDGPU::V_CMPX_LE_I64_e64:
+    case AMDGPU::V_CMPX_LE_U64_e32:
+    case AMDGPU::V_CMPX_LE_U64_e64:
+    case AMDGPU::V_CMPX_LT_I64_e32:
+    case AMDGPU::V_CMPX_LT_I64_e64:
+    case AMDGPU::V_CMPX_LT_U64_e32:
+    case AMDGPU::V_CMPX_LT_U64_e64:
+    case AMDGPU::V_CMPX_NE_I64_e32:
+    case AMDGPU::V_CMPX_NE_I64_e64:
+    case AMDGPU::V_CMPX_NE_U64_e32:
+    case AMDGPU::V_CMPX_NE_U64_e64:
+    case AMDGPU::V_CMPX_T_I64_e32:
+    case AMDGPU::V_CMPX_T_I64_e64:
+    case AMDGPU::V_CMPX_T_U64_e32:
+    case AMDGPU::V_CMPX_T_U64_e64:
+    case AMDGPU::V_CMP_EQ_I64_e32:
+    case AMDGPU::V_CMP_EQ_I64_e64:
+    case AMDGPU::V_CMP_EQ_U64_e32:
+    case AMDGPU::V_CMP_EQ_U64_e64:
+    case AMDGPU::V_CMP_F_I64_e32:
+    case AMDGPU::V_CMP_F_I64_e64:
+    case AMDGPU::V_CMP_F_U64_e32:
+    case AMDGPU::V_CMP_F_U64_e64:
+    case AMDGPU::V_CMP_GE_I64_e32:
+    case AMDGPU::V_CMP_GE_I64_e64:
+    case AMDGPU::V_CMP_GE_U64_e32:
+    case AMDGPU::V_CMP_GE_U64_e64:
+    case AMDGPU::V_CMP_GT_I64_e32:
+    case AMDGPU::V_CMP_GT_I64_e64:
+    case AMDGPU::V_CMP_GT_U64_e32:
+    case AMDGPU::V_CMP_GT_U64_e64:
+    case AMDGPU::V_CMP_LE_I64_e32:
+    case AMDGPU::V_CMP_LE_I64_e64:
+    case AMDGPU::V_CMP_LE_U64_e32:
+    case AMDGPU::V_CMP_LE_U64_e64:
+    case AMDGPU::V_CMP_LT_I64_e32:
+    case AMDGPU::V_CMP_LT_I64_e64:
+    case AMDGPU::V_CMP_LT_U64_e32:
+    case AMDGPU::V_CMP_LT_U64_e64:
+    case AMDGPU::V_CMP_NE_I64_e32:
+    case AMDGPU::V_CMP_NE_I64_e64:
+    case AMDGPU::V_CMP_NE_U64_e32:
+    case AMDGPU::V_CMP_NE_U64_e64:
+    case AMDGPU::V_CMP_T_I64_e32:
+    case AMDGPU::V_CMP_T_I64_e64:
+    case AMDGPU::V_CMP_T_U64_e32:
+    case AMDGPU::V_CMP_T_U64_e64:
+    case AMDGPU::V_MUL_LO_U32_e64:
+    case AMDGPU::V_MUL_HI_U32_e64:
+    case AMDGPU::V_MUL_HI_I32_e64:
+    case AMDGPU::V_SWAP_B32:
+    case AMDGPU::V_DOT4_I32_I8:
+    case AMDGPU::V_DOT4_U32_U8:
+      return false;
+    default:
+      return IsValidOpcode(MI);
+    }
+  }
+
+  static bool IsValidProducerOpcode(const MachineInstr &MI) {
+    // Only VALU instructions are valid producers.
+    if (!SIInstrInfo::isVALU(MI))
+      return false;
+
+    // VALU instructions that take multiple cycles should not be marked as
+    // single use.
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELD_B32_e32:
+    case AMDGPU::V_MOVRELD_B32_e64:
+      return false;
+    default:
+      return IsValidOpcode(MI);
+    }
+  }
+
   void insertSingleUseInstructions(
       ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
     SmallVector<SingleUseInstruction> Instructions;
@@ -214,12 +361,13 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
           RegisterUseCount[Unit]++;
 
         // Do not attempt to optimise across exec mask changes.
-        if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+        if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
+            !IsValidConsumerOpcode(MI)) {
           for (auto &UsedReg : RegisterUseCount)
             UsedReg.second = 2;
         }
 
-        if (!SIInstrInfo::isVALU(MI))
+        if (!IsValidProducerOpcode(MI))
           continue;
         if (AllProducerOperandsAreSingleUse) {
           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index f2a5139b73b10..129e577fb8a5a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
 
 # One single-use producer.
 ---
@@ -1238,3 +1237,159 @@ body: |
     liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36
 
 ...
+
+# Tests for multi-cycle instructions that are explicitly excluded.
+
+# Valid producers but invalid consumer opcodes.
+---
+name: v_mul_hi_u32_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_mul_hi_u32_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr3
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr3
+...
+
+---
+name: v_cmpx_eq_u64_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_cmpx_eq_u64_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec
+    S_BRANCH %bb.1
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Invalid producers but valid consumer opcodes.
+---
+name: v_movreld_b32_e32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_movreld_b32_e32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4)
+  ; CHECK-NEXT:   $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr3
+  bb.0:
+    liveins: $vgpr0, $vgpr2
+    $m0 = S_MOV_B32 0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4)
+    $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr3
+...
+
+# Invalid producers and invalid consumer opcodes.
+---
+name: v_writelane_b32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_writelane_b32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1
+    $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# DPP instructions cannot be single-use producers or consumers.
+---
+name: V_ADD_NC_U32_dpp
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: V_ADD_NC_U32_dpp
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $vcc
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Exception to the rule that DPP instructions
+# cannot be single-use producers or consumers.
+---
+name: V_INTERP_MOV_F32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: V_INTERP_MOV_F32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1
+  bb.0:
+    $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1
+...
+

``````````



https://github.com/llvm/llvm-project/pull/91802

