[llvm] [AMDGPU] Exclude certain opcodes from being marked as single use (PR #91802)

Scott Egerton via llvm-commits llvm-commits at lists.llvm.org
Fri May 10 13:26:04 PDT 2024


https://github.com/ScottEgerton created https://github.com/llvm/llvm-project/pull/91802

The s_singleuse_vdst instruction marks regions of instructions that
produce values that have only one use.
Certain instructions take more than one cycle to execute, which can result
in regions being marked incorrectly.
This patch excludes these multi-cycle instructions from being marked as
producing single-use values, as consuming single-use values, or as both,
depending on the instruction.
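
To make the structure of the change easier to follow, here is a minimal
standalone C++ sketch of the three-predicate gating the patch introduces.
The Opcode enum and the lowercase helper names below are illustrative
stand-ins, not the real AMDGPU opcode tables; the actual pass switches on
MachineInstr::getOpcode() and consults SIInstrInfo, as shown in the diff.

  #include <cstdio>

  // Illustrative stand-ins for real AMDGPU opcodes.
  enum class Opcode {
    V_MOV_B32,       // single-cycle VALU: valid in both roles
    V_WRITELANE_B32, // excluded as both producer and consumer
    V_MUL_HI_U32,    // excluded as a consumer only
    V_MOVRELD_B32,   // excluded as a producer only
  };

  // Mirrors IsValidOpcode: opcodes excluded from both roles.
  static bool isValidOpcode(Opcode Op) {
    return Op != Opcode::V_WRITELANE_B32;
  }

  // Mirrors IsValidConsumerOpcode: consumer-only exclusions, then the
  // shared check.
  static bool isValidConsumerOpcode(Opcode Op) {
    if (Op == Opcode::V_MUL_HI_U32)
      return false;
    return isValidOpcode(Op);
  }

  // Mirrors IsValidProducerOpcode: producer-only exclusions, then the
  // shared check.
  static bool isValidProducerOpcode(Opcode Op) {
    if (Op == Opcode::V_MOVRELD_B32)
      return false;
    return isValidOpcode(Op);
  }

  int main() {
    const Opcode Ops[] = {Opcode::V_MOV_B32, Opcode::V_WRITELANE_B32,
                          Opcode::V_MUL_HI_U32, Opcode::V_MOVRELD_B32};
    for (Opcode Op : Ops)
      std::printf("producer=%d consumer=%d\n",
                  (int)isValidProducerOpcode(Op),
                  (int)isValidConsumerOpcode(Op));
  }

Note that in the pass itself an invalid consumer does more than opt out:
it bumps every running register-use count past one, the same way an EXEC
write does, so no single-use region is carried across it.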


From e3c7c24f784357910b695d365a8a81b4fa9f7adf Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Fri, 1 Mar 2024 10:24:57 +0000
Subject: [PATCH] [AMDGPU] Exclude certain opcodes from being marked as single
 use

The s_singleuse_vdst instruction marks regions of instructions that
produce values that have only one use.
Certain instructions take more than one cycle to execute, which can result
in regions being marked incorrectly.
This patch excludes these multi-cycle instructions from being marked as
producing single-use values, as consuming single-use values, or as both,
depending on the instruction.
---
 .../AMDGPU/AMDGPUInsertSingleUseVDST.cpp      | 152 ++++++++++++++++-
 .../CodeGen/AMDGPU/insert-singleuse-vdst.mir  | 157 +++++++++++++++++-
 2 files changed, 306 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index b78952ca3a622..0571b49ba7700 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -132,6 +132,153 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 
   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
 
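+  // Returns false for opcodes that must not be marked single use in either
+  // role. DPP instructions are also rejected, except for the interp opcodes
+  // listed below.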
+  static bool IsValidOpcode(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELSD_B32_e32:
+    case AMDGPU::V_MOVRELSD_B32_e64:
+    case AMDGPU::V_SWAPREL_B32:
+    case AMDGPU::V_PERMLANE64_B32:
+    case AMDGPU::V_PERMLANE16_B32_e64:
+    case AMDGPU::V_PERMLANE16_B32_gfx10:
+    case AMDGPU::V_PERMLANEX16_B32_e64:
+    case AMDGPU::V_PERMLANEX16_B32_gfx10:
+    case AMDGPU::V_WRITELANE_B32:
+      return false;
+    default:
+      if (SIInstrInfo::isDPP(MI)) {
+        switch (MI.getOpcode()) {
+        case AMDGPU::V_INTERP_MOV_F32:
+        case AMDGPU::V_INTERP_P1_F32_16bank:
+        case AMDGPU::V_INTERP_P1_F32:
+        case AMDGPU::V_INTERP_P2_F32:
+        case AMDGPU::V_INTERP_MOV_F32_e64:
+        case AMDGPU::V_INTERP_P10_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P10_F32_inreg:
+        case AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P1_F32_e64_vi:
+        case AMDGPU::V_INTERP_P1LL_F16_vi:
+        case AMDGPU::V_INTERP_P1LV_F16_vi:
+        case AMDGPU::V_INTERP_P2_F16_vi:
+        case AMDGPU::V_INTERP_P2_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P2_F32_inreg:
+        case AMDGPU::V_INTERP_P2_F32_e64:
+        case AMDGPU::V_INTERP_P2_LEGACY_F16_gfx9:
+        case AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg:
+          return true;
+        default:
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
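+  // Returns false for opcodes excluded from consuming single-use values, on
+  // top of the shared exclusions in IsValidOpcode.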
+  static bool IsValidConsumerOpcode(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELS_B32_e32:
+    case AMDGPU::V_MOVRELS_B32_e64:
+    case AMDGPU::V_READFIRSTLANE_B32:
+    case AMDGPU::V_READLANE_B32:
+    case AMDGPU::V_MAD_I64_I32_vi:
+    case AMDGPU::V_MAD_U64_U32_vi:
+    case AMDGPU::V_ASHRREV_I64_vi:
+    case AMDGPU::V_LSHLREV_B64_vi:
+    case AMDGPU::V_LSHRREV_B64_vi:
+    case AMDGPU::V_MQSAD_PK_U16_U8_vi:
+    case AMDGPU::V_MQSAD_U32_U8_vi:
+    case AMDGPU::V_QSAD_PK_U16_U8_vi:
+    case AMDGPU::V_CMPX_EQ_I64_e32:
+    case AMDGPU::V_CMPX_EQ_I64_e64:
+    case AMDGPU::V_CMPX_EQ_U64_e32:
+    case AMDGPU::V_CMPX_EQ_U64_e64:
+    case AMDGPU::V_CMPX_F_I64_e32:
+    case AMDGPU::V_CMPX_F_I64_e64:
+    case AMDGPU::V_CMPX_F_U64_e32:
+    case AMDGPU::V_CMPX_F_U64_e64:
+    case AMDGPU::V_CMPX_GE_I64_e32:
+    case AMDGPU::V_CMPX_GE_I64_e64:
+    case AMDGPU::V_CMPX_GE_U64_e32:
+    case AMDGPU::V_CMPX_GE_U64_e64:
+    case AMDGPU::V_CMPX_GT_I64_e32:
+    case AMDGPU::V_CMPX_GT_I64_e64:
+    case AMDGPU::V_CMPX_GT_U64_e32:
+    case AMDGPU::V_CMPX_GT_U64_e64:
+    case AMDGPU::V_CMPX_LE_I64_e32:
+    case AMDGPU::V_CMPX_LE_I64_e64:
+    case AMDGPU::V_CMPX_LE_U64_e32:
+    case AMDGPU::V_CMPX_LE_U64_e64:
+    case AMDGPU::V_CMPX_LT_I64_e32:
+    case AMDGPU::V_CMPX_LT_I64_e64:
+    case AMDGPU::V_CMPX_LT_U64_e32:
+    case AMDGPU::V_CMPX_LT_U64_e64:
+    case AMDGPU::V_CMPX_NE_I64_e32:
+    case AMDGPU::V_CMPX_NE_I64_e64:
+    case AMDGPU::V_CMPX_NE_U64_e32:
+    case AMDGPU::V_CMPX_NE_U64_e64:
+    case AMDGPU::V_CMPX_T_I64_e32:
+    case AMDGPU::V_CMPX_T_I64_e64:
+    case AMDGPU::V_CMPX_T_U64_e32:
+    case AMDGPU::V_CMPX_T_U64_e64:
+    case AMDGPU::V_CMP_EQ_I64_e32:
+    case AMDGPU::V_CMP_EQ_I64_e64:
+    case AMDGPU::V_CMP_EQ_U64_e32:
+    case AMDGPU::V_CMP_EQ_U64_e64:
+    case AMDGPU::V_CMP_F_I64_e32:
+    case AMDGPU::V_CMP_F_I64_e64:
+    case AMDGPU::V_CMP_F_U64_e32:
+    case AMDGPU::V_CMP_F_U64_e64:
+    case AMDGPU::V_CMP_GE_I64_e32:
+    case AMDGPU::V_CMP_GE_I64_e64:
+    case AMDGPU::V_CMP_GE_U64_e32:
+    case AMDGPU::V_CMP_GE_U64_e64:
+    case AMDGPU::V_CMP_GT_I64_e32:
+    case AMDGPU::V_CMP_GT_I64_e64:
+    case AMDGPU::V_CMP_GT_U64_e32:
+    case AMDGPU::V_CMP_GT_U64_e64:
+    case AMDGPU::V_CMP_LE_I64_e32:
+    case AMDGPU::V_CMP_LE_I64_e64:
+    case AMDGPU::V_CMP_LE_U64_e32:
+    case AMDGPU::V_CMP_LE_U64_e64:
+    case AMDGPU::V_CMP_LT_I64_e32:
+    case AMDGPU::V_CMP_LT_I64_e64:
+    case AMDGPU::V_CMP_LT_U64_e32:
+    case AMDGPU::V_CMP_LT_U64_e64:
+    case AMDGPU::V_CMP_NE_I64_e32:
+    case AMDGPU::V_CMP_NE_I64_e64:
+    case AMDGPU::V_CMP_NE_U64_e32:
+    case AMDGPU::V_CMP_NE_U64_e64:
+    case AMDGPU::V_CMP_T_I64_e32:
+    case AMDGPU::V_CMP_T_I64_e64:
+    case AMDGPU::V_CMP_T_U64_e32:
+    case AMDGPU::V_CMP_T_U64_e64:
+    case AMDGPU::V_MUL_LO_U32_e64:
+    case AMDGPU::V_MUL_HI_U32_e64:
+    case AMDGPU::V_MUL_HI_I32_e64:
+    case AMDGPU::V_SWAP_B32:
+    case AMDGPU::V_DOT4_I32_I8:
+    case AMDGPU::V_DOT4_U32_U8:
+      return false;
+    default:
+      return IsValidOpcode(MI);
+    }
+  }
+
+  static bool IsValidProducerOpcode(const MachineInstr &MI) {
+    // Only VALU instructions are valid producers.
+    if (!SIInstrInfo::isVALU(MI))
+      return false;
+
+    // VALU instructions that take multiple cycles should not be marked as
+    // single use.
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELD_B32_e32:
+    case AMDGPU::V_MOVRELD_B32_e64:
+      return false;
+    default:
+      return IsValidOpcode(MI);
+    }
+  }
+
   void insertSingleUseInstructions(
       ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
     SmallVector<SingleUseInstruction> Instructions;
@@ -214,12 +361,13 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
           RegisterUseCount[Unit]++;
 
         // Do not attempt to optimise across exec mask changes.
-        if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+        if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
+            !IsValidConsumerOpcode(MI)) {
           for (auto &UsedReg : RegisterUseCount)
             UsedReg.second = 2;
         }
 
-        if (!SIInstrInfo::isVALU(MI))
+        if (!IsValidProducerOpcode(MI))
           continue;
         if (AllProducerOperandsAreSingleUse) {
           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index f2a5139b73b10..129e577fb8a5a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
 
 # One single-use producer.
 ---
@@ -1238,3 +1237,159 @@ body: |
     liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36
 
 ...
+
+# Tests for multi-cycle instructions that are explicitly excluded.
+
+# Valid producers but invalid consumer opcodes.
+---
+name: v_mul_hi_u32_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_mul_hi_u32_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr3
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr3
+...
+
+---
+name: v_cmpx_eq_u64_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_cmpx_eq_u64_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec
+    S_BRANCH %bb.1
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Invalid producers but valid consumer opcodes.
+---
+name: v_movreld_b32_e32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_movreld_b32_e32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4)
+  ; CHECK-NEXT:   $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr3
+  bb.0:
+    liveins: $vgpr0, $vgpr2
+    $m0 = S_MOV_B32 0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4)
+    $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr3
+...
+
+# Invalid producers and invalid consumer opcodes.
+---
+name: v_writelane_b32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_writelane_b32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1
+    $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# DPP instructions cannot be single-use producers or consumers.
+---
+name: V_ADDC_U32_dpp
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: V_ADDC_U32_dpp
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $vcc
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Exception to the rule that DPP instructions
+# cannot be single-use producers or consumers.
+---
+name: V_INTERP_MOV_F32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: V_INTERP_MOV_F32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1
+  bb.0:
+    $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1
+...
+