[llvm] [AMDGPU] Exclude certain opcodes from being marked as single use (PR #91802)

Scott Egerton via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 11 13:28:17 PDT 2024


https://github.com/ScottEgerton updated https://github.com/llvm/llvm-project/pull/91802

>From e3c7c24f784357910b695d365a8a81b4fa9f7adf Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Fri, 1 Mar 2024 10:24:57 +0000
Subject: [PATCH 1/3] [AMDGPU] Exclude certain opcodes from being marked as
 single use

The s_singleuse_vdst instruction marks regions of instructions that
produce values with only one use. Certain instructions take more than
one cycle to execute, which can result in regions being marked
incorrectly. This patch excludes these multi-cycle instructions from
being marked as producing single-use values, as consuming single-use
values, or both, depending on the instruction.
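In pass terms, the fix is two predicate gates. A condensed, hedged C++
sketch follows (invalidatePendingProducers and recordProducerCandidate
are illustrative stand-ins for the pass's RegisterUseCount bookkeeping,
and the pass's actual iteration order is elided):

    for (MachineInstr &MI : MBB) {
      // An instruction that is not a valid single-use consumer poisons
      // every value still in flight, exactly as an EXEC write already
      // did: the pass bumps each pending use count to 2 so nothing can
      // be classed as single use.
      if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
          !IsValidConsumerOpcode(MI))
        invalidatePendingProducers();

      // Only a valid producer opcode may be recorded for marking.
      if (!IsValidProducerOpcode(MI))
        continue;
      recordProducerCandidate(MI);
    }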
---
 .../AMDGPU/AMDGPUInsertSingleUseVDST.cpp      | 152 ++++++++++++++++-
 .../CodeGen/AMDGPU/insert-singleuse-vdst.mir  | 157 +++++++++++++++++-
 2 files changed, 306 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index b78952ca3a622..0571b49ba7700 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -132,6 +132,153 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 
   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
 
+  static bool IsValidOpcode(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELSD_B32_e32:
+    case AMDGPU::V_MOVRELSD_B32_e64:
+    case AMDGPU::V_SWAPREL_B32:
+    case AMDGPU::V_PERMLANE64_B32:
+    case AMDGPU::V_PERMLANE16_B32_e64:
+    case AMDGPU::V_PERMLANE16_B32_gfx10:
+    case AMDGPU::V_PERMLANEX16_B32_e64:
+    case AMDGPU::V_PERMLANEX16_B32_gfx10:
+    case AMDGPU::V_WRITELANE_B32:
+      return false;
+    default:
+      if (SIInstrInfo::isDPP(MI)) {
+        switch (MI.getOpcode()) {
+        case AMDGPU::V_INTERP_MOV_F32:
+        case AMDGPU::V_INTERP_P1_F32_16bank:
+        case AMDGPU::V_INTERP_P1_F32:
+        case AMDGPU::V_INTERP_P2_F32:
+        case AMDGPU::V_INTERP_MOV_F32_e64:
+        case AMDGPU::V_INTERP_P10_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P10_F32_inreg:
+        case AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P1_F32_e64_vi:
+        case AMDGPU::V_INTERP_P1LL_F16_vi:
+        case AMDGPU::V_INTERP_P1LV_F16_vi:
+        case AMDGPU::V_INTERP_P2_F16_vi:
+        case AMDGPU::V_INTERP_P2_F16_F32_inreg:
+        case AMDGPU::V_INTERP_P2_F32_inreg:
+        case AMDGPU::V_INTERP_P2_F32_e64:
+        case AMDGPU::V_INTERP_P2_LEGACY_F16_gfx9:
+        case AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg:
+          return true;
+        default:
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  static bool IsValidConsumerOpcode(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELS_B32_e32:
+    case AMDGPU::V_MOVRELS_B32_e64:
+    case AMDGPU::V_READFIRSTLANE_B32:
+    case AMDGPU::V_READLANE_B32:
+    case AMDGPU::V_MAD_I64_I32_vi:
+    case AMDGPU::V_MAD_U64_U32_vi:
+    case AMDGPU::V_ASHRREV_I64_vi:
+    case AMDGPU::V_LSHLREV_B64_vi:
+    case AMDGPU::V_LSHRREV_B64_vi:
+    case AMDGPU::V_MQSAD_PK_U16_U8_vi:
+    case AMDGPU::V_MQSAD_U32_U8_vi:
+    case AMDGPU::V_QSAD_PK_U16_U8_vi:
+    case AMDGPU::V_CMPX_EQ_I64_e32:
+    case AMDGPU::V_CMPX_EQ_I64_e64:
+    case AMDGPU::V_CMPX_EQ_U64_e32:
+    case AMDGPU::V_CMPX_EQ_U64_e64:
+    case AMDGPU::V_CMPX_F_I64_e32:
+    case AMDGPU::V_CMPX_F_I64_e64:
+    case AMDGPU::V_CMPX_F_U64_e32:
+    case AMDGPU::V_CMPX_F_U64_e64:
+    case AMDGPU::V_CMPX_GE_I64_e32:
+    case AMDGPU::V_CMPX_GE_I64_e64:
+    case AMDGPU::V_CMPX_GE_U64_e32:
+    case AMDGPU::V_CMPX_GE_U64_e64:
+    case AMDGPU::V_CMPX_GT_I64_e32:
+    case AMDGPU::V_CMPX_GT_I64_e64:
+    case AMDGPU::V_CMPX_GT_U64_e32:
+    case AMDGPU::V_CMPX_GT_U64_e64:
+    case AMDGPU::V_CMPX_LE_I64_e32:
+    case AMDGPU::V_CMPX_LE_I64_e64:
+    case AMDGPU::V_CMPX_LE_U64_e32:
+    case AMDGPU::V_CMPX_LE_U64_e64:
+    case AMDGPU::V_CMPX_LT_I64_e32:
+    case AMDGPU::V_CMPX_LT_I64_e64:
+    case AMDGPU::V_CMPX_LT_U64_e32:
+    case AMDGPU::V_CMPX_LT_U64_e64:
+    case AMDGPU::V_CMPX_NE_I64_e32:
+    case AMDGPU::V_CMPX_NE_I64_e64:
+    case AMDGPU::V_CMPX_NE_U64_e32:
+    case AMDGPU::V_CMPX_NE_U64_e64:
+    case AMDGPU::V_CMPX_T_I64_e32:
+    case AMDGPU::V_CMPX_T_I64_e64:
+    case AMDGPU::V_CMPX_T_U64_e32:
+    case AMDGPU::V_CMPX_T_U64_e64:
+    case AMDGPU::V_CMP_EQ_I64_e32:
+    case AMDGPU::V_CMP_EQ_I64_e64:
+    case AMDGPU::V_CMP_EQ_U64_e32:
+    case AMDGPU::V_CMP_EQ_U64_e64:
+    case AMDGPU::V_CMP_F_I64_e32:
+    case AMDGPU::V_CMP_F_I64_e64:
+    case AMDGPU::V_CMP_F_U64_e32:
+    case AMDGPU::V_CMP_F_U64_e64:
+    case AMDGPU::V_CMP_GE_I64_e32:
+    case AMDGPU::V_CMP_GE_I64_e64:
+    case AMDGPU::V_CMP_GE_U64_e32:
+    case AMDGPU::V_CMP_GE_U64_e64:
+    case AMDGPU::V_CMP_GT_I64_e32:
+    case AMDGPU::V_CMP_GT_I64_e64:
+    case AMDGPU::V_CMP_GT_U64_e32:
+    case AMDGPU::V_CMP_GT_U64_e64:
+    case AMDGPU::V_CMP_LE_I64_e32:
+    case AMDGPU::V_CMP_LE_I64_e64:
+    case AMDGPU::V_CMP_LE_U64_e32:
+    case AMDGPU::V_CMP_LE_U64_e64:
+    case AMDGPU::V_CMP_LT_I64_e32:
+    case AMDGPU::V_CMP_LT_I64_e64:
+    case AMDGPU::V_CMP_LT_U64_e32:
+    case AMDGPU::V_CMP_LT_U64_e64:
+    case AMDGPU::V_CMP_NE_I64_e32:
+    case AMDGPU::V_CMP_NE_I64_e64:
+    case AMDGPU::V_CMP_NE_U64_e32:
+    case AMDGPU::V_CMP_NE_U64_e64:
+    case AMDGPU::V_CMP_T_I64_e32:
+    case AMDGPU::V_CMP_T_I64_e64:
+    case AMDGPU::V_CMP_T_U64_e32:
+    case AMDGPU::V_CMP_T_U64_e64:
+    case AMDGPU::V_MUL_LO_U32_e64:
+    case AMDGPU::V_MUL_HI_U32_e64:
+    case AMDGPU::V_MUL_HI_I32_e64:
+    case AMDGPU::V_SWAP_B32:
+    case AMDGPU::V_DOT4_I32_I8:
+    case AMDGPU::V_DOT4_U32_U8:
+      return false;
+    default:
+      return IsValidOpcode(MI);
+    }
+  }
+
+  static bool IsValidProducerOpcode(const MachineInstr &MI) {
+    // Only VALU instructions are valid producers.
+    if (!SIInstrInfo::isVALU(MI))
+      return false;
+
+    // VALU instructions that take multiple cycles should not be marked as
+    // single use.
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_MOVRELD_B32_e32:
+    case AMDGPU::V_MOVRELD_B32_e64:
+      return false;
+    default:
+      return IsValidOpcode(MI);
+    }
+  }
+
   void insertSingleUseInstructions(
       ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
     SmallVector<SingleUseInstruction> Instructions;
@@ -214,12 +361,13 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
           RegisterUseCount[Unit]++;
 
         // Do not attempt to optimise across exec mask changes.
-        if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+        if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
+            !IsValidConsumerOpcode(MI)) {
           for (auto &UsedReg : RegisterUseCount)
             UsedReg.second = 2;
         }
 
-        if (!SIInstrInfo::isVALU(MI))
+        if (!IsValidProducerOpcode(MI))
           continue;
         if (AllProducerOperandsAreSingleUse) {
           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index f2a5139b73b10..129e577fb8a5a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -1,6 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
 
 # One single-use producer.
 ---
@@ -1238,3 +1237,159 @@ body: |
     liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36
 
 ...
+
+# Tests for multi-cycle instructions that are explicitly excluded.
+
+# Valid producers but invalid consumer opcodes.
+---
+name: v_mul_hi_u32_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_mul_hi_u32_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr3
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr3
+...
+
+---
+name: v_cmpx_t_u64_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_cmpx_t_u64_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec
+    S_BRANCH %bb.1
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Invalid producers but valid consumer opcodes.
+---
+name: v_movreld_b32_e32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_movreld_b32_e32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4)
+  ; CHECK-NEXT:   $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr3
+  bb.0:
+    liveins: $vgpr0, $vgpr2
+    $m0 = S_MOV_B32 0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4)
+    $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr3
+...
+
+# Invalid producers and invalid consumer opcodes.
+---
+name: v_writelane_b32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_writelane_b32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1
+    $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# DPP instructions cannot be single use producers or consumers.
+---
+name: V_ADD_NC_U32_dpp
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: V_ADD_NC_U32_dpp
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  bb.0:
+    liveins: $vgpr0, $vcc
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+    $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
+  bb.1:
+    liveins: $vgpr0
+...
+
+# Exception to the rule that DPP instructions
+# cannot be single use producers or consumers.
+---
+name: V_INTERP_MOV_F32
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: V_INTERP_MOV_F32
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr1
+  bb.0:
+    $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  bb.1:
+    liveins: $vgpr1
+...
+

>From ddda3e30031c3d534a9b0c0f59ae55680c7adec5 Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Mon, 10 Jun 2024 22:09:08 +0100
Subject: [PATCH 2/3] Convert single use exception definitions to TableGen

Previously, invalid single-use consumers and invalid single-use
producers were listed in a C++ switch statement. This commit instead
defines them in TableGen, alongside the instruction definitions, and
accesses them through a searchable table.
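For reference, a TableGen searchable table emits a sorted array plus a
binary-search helper named by PrimaryKeyName; here the struct lives in
AMDGPUBaseInfo.cpp and the table and helper are generated into
AMDGPUGenSearchableTables.inc. A hedged sketch of roughly what gets
generated (illustrative, not the literal output):

    struct SingleUseExceptionInfo {
      uint16_t Opcode;
      bool IsInvalidSingleUseConsumer;
      bool IsInvalidSingleUseProducer;
    };

    // One row per VOP_Pseudo definition, sorted by Opcode.
    // Two illustrative rows:
    static const SingleUseExceptionInfo SingleUseExceptionTable[] = {
      {AMDGPU::V_READFIRSTLANE_B32, true, false},
      {AMDGPU::V_WRITELANE_B32, true, true},
      // ...
    };

    // Binary-searches on Opcode; returns nullptr when there is no row.
    const SingleUseExceptionInfo *getSingleUseExceptionHelper(uint16_t Opcode);

Each query in AMDGPUBaseInfo.cpp then reduces to a null check plus a
field read.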
---
 .../AMDGPU/AMDGPUInsertSingleUseVDST.cpp      | 148 +-----------------
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |   2 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  18 +++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   6 +
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |  18 ++-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |   6 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  35 +++--
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  12 +-
 llvm/lib/Target/AMDGPU/VOPCInstructions.td    |  12 +-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |  22 ++-
 .../CodeGen/AMDGPU/insert-singleuse-vdst.mir  |  29 +++-
 11 files changed, 132 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index 0571b49ba7700..b42ddd25564c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUGenSearchableTables.inc"
 #include "GCNSubtarget.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
@@ -132,151 +133,12 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 
   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
 
-  static bool IsValidOpcode(const MachineInstr &MI) {
-    switch (MI.getOpcode()) {
-    case AMDGPU::V_MOVRELSD_B32_e32:
-    case AMDGPU::V_MOVRELSD_B32_e64:
-    case AMDGPU::V_SWAPREL_B32:
-    case AMDGPU::V_PERMLANE64_B32:
-    case AMDGPU::V_PERMLANE16_B32_e64:
-    case AMDGPU::V_PERMLANE16_B32_gfx10:
-    case AMDGPU::V_PERMLANEX16_B32_e64:
-    case AMDGPU::V_PERMLANEX16_B32_gfx10:
-    case AMDGPU::V_WRITELANE_B32:
-      return false;
-    default:
-      if (SIInstrInfo::isDPP(MI)) {
-        switch (MI.getOpcode()) {
-        case AMDGPU::V_INTERP_MOV_F32:
-        case AMDGPU::V_INTERP_P1_F32_16bank:
-        case AMDGPU::V_INTERP_P1_F32:
-        case AMDGPU::V_INTERP_P2_F32:
-        case AMDGPU::V_INTERP_MOV_F32_e64:
-        case AMDGPU::V_INTERP_P10_F16_F32_inreg:
-        case AMDGPU::V_INTERP_P10_F32_inreg:
-        case AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg:
-        case AMDGPU::V_INTERP_P1_F32_e64_vi:
-        case AMDGPU::V_INTERP_P1LL_F16_vi:
-        case AMDGPU::V_INTERP_P1LV_F16_vi:
-        case AMDGPU::V_INTERP_P2_F16_vi:
-        case AMDGPU::V_INTERP_P2_F16_F32_inreg:
-        case AMDGPU::V_INTERP_P2_F32_inreg:
-        case AMDGPU::V_INTERP_P2_F32_e64:
-        case AMDGPU::V_INTERP_P2_LEGACY_F16_gfx9:
-        case AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg:
-          return true;
-        default:
-          return false;
-        }
-      }
-      return true;
-    }
-  }
-
-  static bool IsValidConsumerOpcode(const MachineInstr &MI) {
-    switch (MI.getOpcode()) {
-    case AMDGPU::V_MOVRELS_B32_e32:
-    case AMDGPU::V_MOVRELS_B32_e64:
-    case AMDGPU::V_READFIRSTLANE_B32:
-    case AMDGPU::V_READLANE_B32:
-    case AMDGPU::V_MAD_I64_I32_vi:
-    case AMDGPU::V_MAD_U64_U32_vi:
-    case AMDGPU::V_ASHRREV_I64_vi:
-    case AMDGPU::V_LSHLREV_B64_vi:
-    case AMDGPU::V_LSHRREV_B64_vi:
-    case AMDGPU::V_MQSAD_PK_U16_U8_vi:
-    case AMDGPU::V_MQSAD_U32_U8_vi:
-    case AMDGPU::V_QSAD_PK_U16_U8_vi:
-    case AMDGPU::V_CMPX_EQ_I64_e32:
-    case AMDGPU::V_CMPX_EQ_I64_e64:
-    case AMDGPU::V_CMPX_EQ_U64_e32:
-    case AMDGPU::V_CMPX_EQ_U64_e64:
-    case AMDGPU::V_CMPX_F_I64_e32:
-    case AMDGPU::V_CMPX_F_I64_e64:
-    case AMDGPU::V_CMPX_F_U64_e32:
-    case AMDGPU::V_CMPX_F_U64_e64:
-    case AMDGPU::V_CMPX_GE_I64_e32:
-    case AMDGPU::V_CMPX_GE_I64_e64:
-    case AMDGPU::V_CMPX_GE_U64_e32:
-    case AMDGPU::V_CMPX_GE_U64_e64:
-    case AMDGPU::V_CMPX_GT_I64_e32:
-    case AMDGPU::V_CMPX_GT_I64_e64:
-    case AMDGPU::V_CMPX_GT_U64_e32:
-    case AMDGPU::V_CMPX_GT_U64_e64:
-    case AMDGPU::V_CMPX_LE_I64_e32:
-    case AMDGPU::V_CMPX_LE_I64_e64:
-    case AMDGPU::V_CMPX_LE_U64_e32:
-    case AMDGPU::V_CMPX_LE_U64_e64:
-    case AMDGPU::V_CMPX_LT_I64_e32:
-    case AMDGPU::V_CMPX_LT_I64_e64:
-    case AMDGPU::V_CMPX_LT_U64_e32:
-    case AMDGPU::V_CMPX_LT_U64_e64:
-    case AMDGPU::V_CMPX_NE_I64_e32:
-    case AMDGPU::V_CMPX_NE_I64_e64:
-    case AMDGPU::V_CMPX_NE_U64_e32:
-    case AMDGPU::V_CMPX_NE_U64_e64:
-    case AMDGPU::V_CMPX_T_I64_e32:
-    case AMDGPU::V_CMPX_T_I64_e64:
-    case AMDGPU::V_CMPX_T_U64_e32:
-    case AMDGPU::V_CMPX_T_U64_e64:
-    case AMDGPU::V_CMP_EQ_I64_e32:
-    case AMDGPU::V_CMP_EQ_I64_e64:
-    case AMDGPU::V_CMP_EQ_U64_e32:
-    case AMDGPU::V_CMP_EQ_U64_e64:
-    case AMDGPU::V_CMP_F_I64_e32:
-    case AMDGPU::V_CMP_F_I64_e64:
-    case AMDGPU::V_CMP_F_U64_e32:
-    case AMDGPU::V_CMP_F_U64_e64:
-    case AMDGPU::V_CMP_GE_I64_e32:
-    case AMDGPU::V_CMP_GE_I64_e64:
-    case AMDGPU::V_CMP_GE_U64_e32:
-    case AMDGPU::V_CMP_GE_U64_e64:
-    case AMDGPU::V_CMP_GT_I64_e32:
-    case AMDGPU::V_CMP_GT_I64_e64:
-    case AMDGPU::V_CMP_GT_U64_e32:
-    case AMDGPU::V_CMP_GT_U64_e64:
-    case AMDGPU::V_CMP_LE_I64_e32:
-    case AMDGPU::V_CMP_LE_I64_e64:
-    case AMDGPU::V_CMP_LE_U64_e32:
-    case AMDGPU::V_CMP_LE_U64_e64:
-    case AMDGPU::V_CMP_LT_I64_e32:
-    case AMDGPU::V_CMP_LT_I64_e64:
-    case AMDGPU::V_CMP_LT_U64_e32:
-    case AMDGPU::V_CMP_LT_U64_e64:
-    case AMDGPU::V_CMP_NE_I64_e32:
-    case AMDGPU::V_CMP_NE_I64_e64:
-    case AMDGPU::V_CMP_NE_U64_e32:
-    case AMDGPU::V_CMP_NE_U64_e64:
-    case AMDGPU::V_CMP_T_I64_e32:
-    case AMDGPU::V_CMP_T_I64_e64:
-    case AMDGPU::V_CMP_T_U64_e32:
-    case AMDGPU::V_CMP_T_U64_e64:
-    case AMDGPU::V_MUL_LO_U32_e64:
-    case AMDGPU::V_MUL_HI_U32_e64:
-    case AMDGPU::V_MUL_HI_I32_e64:
-    case AMDGPU::V_SWAP_B32:
-    case AMDGPU::V_DOT4_I32_I8:
-    case AMDGPU::V_DOT4_U32_U8:
-      return false;
-    default:
-      return IsValidOpcode(MI);
-    }
-  }
-
-  static bool IsValidProducerOpcode(const MachineInstr &MI) {
+  static bool isValidProducerInst(const MachineInstr &MI) {
     // Only VALU instructions are valid producers.
     if (!SIInstrInfo::isVALU(MI))
       return false;
 
-    // VALU instructions that take multiple cycles should not be marked as
-    // single use.
-    switch (MI.getOpcode()) {
-    case AMDGPU::V_MOVRELD_B32_e32:
-    case AMDGPU::V_MOVRELD_B32_e64:
-      return false;
-    default:
-      return IsValidOpcode(MI);
-    }
+    return !AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode());
   }
 
   void insertSingleUseInstructions(
@@ -362,12 +224,12 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 
         // Do not attempt to optimise across exec mask changes.
         if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
-            !IsValidConsumerOpcode(MI)) {
+            AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
           for (auto &UsedReg : RegisterUseCount)
             UsedReg.second = 2;
         }
 
-        if (!IsValidProducerOpcode(MI))
+        if (!isValidProducerInst(MI))
           continue;
         if (AllProducerOperandsAreSingleUse) {
           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 0ed2f60ea66a7..f2dd86ec4e711 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2266,6 +2266,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field bit EnableClamp = _EnableClamp;
   field bit IsTrue16 = 0;
   field bit IsRealTrue16 = 0;
+  field bit IsInvalidSingleUseConsumer = 0;
+  field bit IsInvalidSingleUseProducer = 0;
 
   field ValueType DstVT = ArgVT[0];
   field ValueType Src0VT = ArgVT[1];
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2beaf903542bd..e6ab86bccdd3e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -372,10 +372,18 @@ struct VOPTrue16Info {
   bool IsTrue16;
 };
 
+struct SingleUseExceptionInfo {
+  uint16_t Opcode;
+  bool IsInvalidSingleUseConsumer;
+  bool IsInvalidSingleUseProducer;
+};
+
 #define GET_MTBUFInfoTable_DECL
 #define GET_MTBUFInfoTable_IMPL
 #define GET_MUBUFInfoTable_DECL
 #define GET_MUBUFInfoTable_IMPL
+#define GET_SingleUseExceptionTable_DECL
+#define GET_SingleUseExceptionTable_IMPL
 #define GET_SMInfoTable_DECL
 #define GET_SMInfoTable_IMPL
 #define GET_VOP1InfoTable_DECL
@@ -607,6 +615,16 @@ bool isTrue16Inst(unsigned Opc) {
   return Info ? Info->IsTrue16 : false;
 }
 
+bool isInvalidSingleUseConsumerInst(unsigned Opc) {
+  const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
+  return Info ? Info->IsInvalidSingleUseConsumer : false;
+}
+
+bool isInvalidSingleUseProducerInst(unsigned Opc) {
+  const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
+  return Info ? Info->IsInvalidSingleUseProducer : false;
+}
+
 unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
   const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
   return Info ? Info->Opcode3Addr : ~0u;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fc4147df76e3e..7478ef0ffad86 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -854,6 +854,12 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo);
 LLVM_READONLY
 bool isTrue16Inst(unsigned Opc);
 
+LLVM_READONLY
+bool isInvalidSingleUseConsumerInst(unsigned Opc);
+
+LLVM_READONLY
+bool isInvalidSingleUseProducerInst(unsigned Opc);
+
 LLVM_READONLY
 unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 012dca22eb4fe..12ee8e12ec5e3 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -246,6 +246,7 @@ def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
                                        getVOP1Pat<int_amdgcn_readfirstlane,
                                                   VOP_READFIRSTLANE>.ret, 1> {
   let isConvergent = 1;
+  let IsInvalidSingleUseConsumer = 1;
 }
 
 let isReMaterializable = 1 in {
@@ -356,6 +357,7 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
 def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> {
   let Src0RC32 = VRegSrc_32;
   let Src0RC64 = VRegSrc_32;
+  let IsInvalidSingleUseConsumer = 1;
 }
 
 // Special case because there are no true output operands.  Hack vdst
@@ -399,8 +401,12 @@ class VOP_MOVREL<RegisterOperand Src1RC> : VOPProfile<[untyped, i32, untyped, un
   let EmitDst = 1; // force vdst emission
 }
 
-def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>;
-def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32>;
+let IsInvalidSingleUseProducer = 1 in {
+  def VOP_MOVRELD : VOP_MOVREL<VSrc_b32>;
+  def VOP_MOVRELSD : VOP_MOVREL<VRegSrc_32> {
+    let IsInvalidSingleUseConsumer = 1;
+  }
+}
 
 let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
  // v_movreld_b32 is a special case because the destination output
@@ -529,6 +535,7 @@ let SubtargetPredicate = isGFX9Plus in {
     let Constraints = "$vdst = $src1, $vdst1 = $src0";
     let DisableEncoding = "$vdst1,$src1";
     let SchedRW = [Write64Bit, Write64Bit];
+    let IsInvalidSingleUseConsumer = 1;
   }
 
   let isReMaterializable = 1 in
@@ -693,6 +700,8 @@ let SubtargetPredicate = isGFX10Plus in {
       let Constraints = "$vdst = $src1, $vdst1 = $src0";
       let DisableEncoding = "$vdst1,$src1";
       let SchedRW = [Write64Bit, Write64Bit];
+      let IsInvalidSingleUseConsumer = 1;
+      let IsInvalidSingleUseProducer = 1;
     }
   } // End Uses = [M0]
 } // End SubtargetPredicate = isGFX10Plus
@@ -714,7 +723,10 @@ let SubtargetPredicate = isGFX11Plus in {
   def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
                                       getVOP1Pat<int_amdgcn_permlane64,
                                                  VOP_MOVRELS>.ret,
-                                      /*VOP1Only=*/ 1>;
+                                      /*VOP1Only=*/ 1> {
+    let IsInvalidSingleUseConsumer = 1;
+    let IsInvalidSingleUseProducer = 1;
+  }
   defm V_MOV_B16_t16    : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
   defm V_NOT_B16        : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
   defm V_CVT_I32_I16    : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d2af1753d5503..4d3411d16a383 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -779,12 +779,14 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
 } // End isCommutable = 1
 
 // These are special and do not read the exec mask.
-let isConvergent = 1, Uses = []<Register> in {
+let isConvergent = 1, Uses = []<Register>, IsInvalidSingleUseConsumer = 1 in {
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
   [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
 let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
 def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
-  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]> {
+    let IsInvalidSingleUseProducer = 1;
+  }
 } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 616bc7684753e..8fb99c0139f2f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -156,10 +156,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
 } // End SchedRW = [WriteDoubleAdd]
 
 let SchedRW = [WriteIntMul] in {
+defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
+let IsInvalidSingleUseConsumer = 1 in {
 defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
 defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>;
-defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
 defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
+} // End IsInvalidSingleUseConsumer = 1
 } // End SchedRW = [WriteIntMul]
 
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
@@ -258,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
 let isReMaterializable = 1 in
 defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 
-let Constraints = "@earlyclobber $vdst" in {
+let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in {
 defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
-} // End Constraints = "@earlyclobber $vdst"
+} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1
 
 
 let isReMaterializable = 1 in {
@@ -275,14 +277,16 @@ let SchedRW = [Write64Bit] in {
   defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
   } // End SubtargetPredicate = isGFX6GFX7
 
+  let IsInvalidSingleUseConsumer = 1 in {
   let SubtargetPredicate = isGFX8Plus in {
   defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>;
   defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>;
-  } // End SubtargetPredicate = isGFX8Plus
+  } // End SubtargetPredicate = isGFX8Plus
 
   let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in {
   defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
   } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11
+  } // End IsInvalidSingleUseConsumer = 1
 } // End SchedRW = [Write64Bit]
 } // End isReMaterializable = 1
 
@@ -307,14 +311,14 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> {
   let HasModifiers = 0;
 }
 
-let SubtargetPredicate = isGFX7Plus in {
+let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in {
 let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
 defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
 defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
 } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
-} // End SubtargetPredicate = isGFX7Plus
+} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1
 
-let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
+let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in {
   let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in {
     defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
     defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
@@ -324,7 +328,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
     defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
     defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
   }
-} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU]
+} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1
 
 
 let FPDPRounding = 1 in {
@@ -859,10 +863,10 @@ let SubtargetPredicate = isGFX10Plus in {
   } // End isCommutable = 1, isReMaterializable = 1
   def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
 
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in {
     defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
     defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
-  } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+  } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1
 
   def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
   def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
@@ -1275,11 +1279,12 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
   }
 } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
 
-defm V_READLANE_B32  : VOP3_Real_No_Suffix_gfx10<0x360>;
-
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
-  defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
+let IsInvalidSingleUseConsumer = 1 in {
+  defm V_READLANE_B32  : VOP3_Real_No_Suffix_gfx10<0x360>;
+  let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in {
+    defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>;
+  } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1
+} // End IsInvalidSingleUseConsumer = 1
 
 let SubtargetPredicate = isGFX10Before1030 in {
   defm V_MUL_LO_I32      : VOP3_Real_gfx10<0x16b>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c3bdbbfc38462..310ad3d731f1d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -382,15 +382,19 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
   AMDGPUfdot2, 1/*ExplicitClamp*/>;
 
 let OtherPredicates = [HasDot7Insts] in {
-defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
-  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+let IsInvalidSingleUseConsumer = 1 in {
+  defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
+    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+}
 defm V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4",
   VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
 } // End OtherPredicates = [HasDot7Insts]
 
 let OtherPredicates = [HasDot1Insts] in {
-defm V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8",
-  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+let IsInvalidSingleUseConsumer = 1 in {
+  defm V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8",
+    VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+}
 defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
   VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
 } // End OtherPredicates = [HasDot1Insts]
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index ddd6d8b074aa3..33dddf850ad21 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -435,8 +435,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
 multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
 
-multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+let IsInvalidSingleUseConsumer = 1 in {
+  multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
+    VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+}
 
 multiclass VOPCX_F16<string opName, string revOp = opName> {
   let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
@@ -465,8 +467,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> {
 multiclass VOPCX_I32 <string opName, string revOp = opName> :
   VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
 
-multiclass VOPCX_I64 <string opName, string revOp = opName> :
-  VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+let IsInvalidSingleUseConsumer = 1 in {
+  multiclass VOPCX_I64 <string opName, string revOp = opName> :
+    VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
+}
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f45ab9bf46db1..c8f794322b677 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -17,6 +17,8 @@ class LetDummies {
   bit isReMaterializable;
   bit isAsCheapAsAMove;
   bit FPDPRounding;
+  bit IsInvalidSingleUseConsumer;
+  bit IsInvalidSingleUseProducer;
   Predicate SubtargetPredicate;
   string Constraints;
   string DisableEncoding;
@@ -81,6 +83,8 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
   string Mnemonic = opName;
   Instruction Opcode = !cast<Instruction>(NAME);
   bit IsTrue16 = P.IsTrue16;
+  bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer;
+  bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer;
   VOPProfile Pfl = P;
 
   string AsmOperands;
@@ -175,6 +179,8 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
 class VOP_Real<VOP_Pseudo ps> {
   Instruction Opcode = !cast<Instruction>(NAME);
   bit IsSingle = ps.Pfl.IsSingle;
+  bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer;
+  bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer;
 }
 
 class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
@@ -819,9 +825,7 @@ class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P
 
 class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
   dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
-  InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>,
-  VOP <OpName>,
-  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
+  VOP_Pseudo<OpName, "_dpp", P, P.OutsDPP, Ins, asmOps, pattern> {
 
   let isPseudo = 1;
   let isCodeGenOnly = 1;
@@ -853,6 +857,9 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
   let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
   let DecoderNamespace = "GFX8";
 
+  let IsInvalidSingleUseConsumer = !not(VINTERP);
+  let IsInvalidSingleUseProducer = !not(VINTERP);
+
   VOPProfile Pfl = P;
 }
 
@@ -1719,3 +1726,12 @@ def VOPTrue16Table : GenericTable {
   let PrimaryKey = ["Opcode"];
   let PrimaryKeyName = "getTrue16OpcodeHelper";
 }
+
+def SingleUseExceptionTable : GenericTable {
+  let FilterClass = "VOP_Pseudo";
+  let CppTypeName = "SingleUseExceptionInfo";
+  let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getSingleUseExceptionHelper";
+}
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 129e577fb8a5a..154036036b32a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -37,14 +37,14 @@ body: |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec
-  ; CHECK-NEXT:   $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec
+  ; CHECK-NEXT:   $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   liveins: $vgpr4_vgpr5
   bb.0:
     liveins: $vgpr0_vgpr1
     $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec
-    $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec
+    $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec
   bb.1:
     liveins: $vgpr4_vgpr5
 ...
@@ -1291,6 +1291,31 @@ body: |
     liveins: $vgpr0
 ...
 
+---
+name: v_lshlrev_b64_e64
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: v_lshlrev_b64_e64
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
+  ; CHECK-NEXT:   $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr4_vgpr5
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec
+    $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec
+    $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec
+  bb.1:
+    liveins: $vgpr4_vgpr5
+...
+
 # Invalid producers but valid consumer opcodes.
 ---
 name: v_movreld_b32_e32

>From d36e71727a937f97ac54516b21c6010e25cce2e0 Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Tue, 11 Jun 2024 21:21:56 +0100
Subject: [PATCH 3/3] fixup! Convert single use exception definitions to
 TableGen

---
 llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 11 ++---------
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp      |  4 ++--
 llvm/lib/Target/AMDGPU/VOP3Instructions.td           |  8 +++-----
 llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir   |  2 +-
 4 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index b42ddd25564c6..43b3bf43fe56d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -133,14 +133,6 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
 
   AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
 
-  static bool isValidProducerInst(const MachineInstr &MI) {
-    // Only VALU instructions are valid producers.
-    if (!SIInstrInfo::isVALU(MI))
-      return false;
-
-    return !AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode());
-  }
-
   void insertSingleUseInstructions(
       ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
     SmallVector<SingleUseInstruction> Instructions;
@@ -229,7 +221,8 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
             UsedReg.second = 2;
         }
 
-        if (!isValidProducerInst(MI))
+        if (!SIInstrInfo::isVALU(MI) ||
+            AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
           continue;
         if (AllProducerOperandsAreSingleUse) {
           SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e6ab86bccdd3e..7b841888cba1d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -617,12 +617,12 @@ bool isTrue16Inst(unsigned Opc) {
 
 bool isInvalidSingleUseConsumerInst(unsigned Opc) {
   const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
-  return Info ? Info->IsInvalidSingleUseConsumer : false;
+  return Info && Info->IsInvalidSingleUseConsumer;
 }
 
 bool isInvalidSingleUseProducerInst(unsigned Opc) {
   const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc);
-  return Info ? Info->IsInvalidSingleUseProducer : false;
+  return Info && Info->IsInvalidSingleUseProducer;
 }
 
 unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 8fb99c0139f2f..fd5f1b71ce331 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -155,14 +155,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
 } // End SubtargetPredicate = isNotGFX12Plus
 } // End SchedRW = [WriteDoubleAdd]
 
-let SchedRW = [WriteIntMul] in {
-defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
-let IsInvalidSingleUseConsumer = 1 in {
+let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in {
 defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
 defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>;
+defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
 defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
-} // End IsInvalidSingleUseConsumer = 1
-} // End SchedRW = [WriteIntMul]
+} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1
 
 let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
 defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 154036036b32a..9e65ce329df43 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s
 
 # One single-use producer.
 ---



More information about the llvm-commits mailing list