[llvm] [AMDGPU] Correctly insert s_nops for dst forwarding hazard (PR #100276)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 21 16:02:46 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/100276

>From 92f9ef7f238ed8e2d06f148fe6e7607fdd573794 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 23 Jul 2024 15:14:14 -0700
Subject: [PATCH 01/17] [AMDGPU] Correctly insert s_nops for implicit read of
 SDWA

Change-Id: I4e22bb3764705f328827eb64704720a0d6aa1a9b
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 24 +++++++++
 .../AMDGPU/sdwa-dst-preserve-hazard.mir       | 54 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a402fc6d7e6110..570fcf63587fd6 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -935,6 +935,30 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
+
+        // If the non-impacted bits of a sdwa dst are preserved, then we
+        // read them and must resolve the hazard. SDWA dst preserve is modelled
+        // as a tied-def implicit use
+        if (SIInstrInfo::isSDWA(*VALU)) {
+          if (auto *DstSel =
+                  TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) {
+            if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+              return false;
+            if (auto *ThisDst =
+                    TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+              if (!TRI->regsOverlap(Def, ThisDst->getReg()))
+                return false;
+              for (const MachineOperand &ImplicitOp :
+                   VALU->implicit_operands()) {
+                if (ImplicitOp.isDef())
+                  continue;
+                if (ImplicitOp.isReg() &&
+                    TRI->regsOverlap(Def, ImplicitOp.getReg()))
+                  return true;
+              }
+            }
+          }
+        }
       }
 
       return false;
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
new file mode 100644
index 00000000000000..ec3b61c4776eef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
+
+---
+name:            hazard_vcmpx_sdwa_permlane16
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
+    ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
+    ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr7 = COPY $vgpr6, implicit $exec
+  renamable $vgpr6 = COPY $vgpr5, implicit $exec
+  renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+  renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+  KILL killed renamable $vgpr2, renamable $vgpr3
+  KILL killed renamable $vgpr0, renamable $vgpr1
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+  renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+  S_ENDPGM 0
+...

>From c6b2c2c0bbcf8d3ae9f1b633db37d7b0fa51ee1a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 12:36:40 -0700
Subject: [PATCH 02/17] Handle opsel case + address review comments

Change-Id: I131dbb24762a53f31af0aa685c48b6c14233bf3d
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  36 +++---
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 103 ++++++++++++++++++
 2 files changed, 116 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 570fcf63587fd6..66349e9527725f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -875,6 +875,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
     return DataIdx >= 0 &&
            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
   };
+
   int WaitStatesNeededForDef =
     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
@@ -931,33 +932,22 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
         Register Def = Dst->getReg();
 
-        for (const MachineOperand &Use : VALU->explicit_uses()) {
+        for (const MachineOperand &Use : VALU->all_uses()) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
 
-        // If the non-impacted bits of a sdwa dst are preserved, then we
-        // read them and must resolve the hazard. SDWA dst preserve is modelled
-        // as a tied-def implicit use
-        if (SIInstrInfo::isSDWA(*VALU)) {
-          if (auto *DstSel =
-                  TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) {
-            if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
-              return false;
-            if (auto *ThisDst =
-                    TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
-              if (!TRI->regsOverlap(Def, ThisDst->getReg()))
-                return false;
-              for (const MachineOperand &ImplicitOp :
-                   VALU->implicit_operands()) {
-                if (ImplicitOp.isDef())
-                  continue;
-                if (ImplicitOp.isReg() &&
-                    TRI->regsOverlap(Def, ImplicitOp.getReg()))
-                  return true;
-              }
-            }
-          }
+        // We also read the dst for sub 32 writes to the same register for ECC
+        if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) {
+          Register ThisDef = ThisDst->getReg();
+          if (!TRI->regsOverlap(Def, ThisDef))
+            return false;
+          if (AMDGPU::hasNamedOperand(VALU->getOpcode(),
+                                      AMDGPU::OpName::op_sel) &&
+              TII->getNamedOperand(*VALU, AMDGPU::OpName::src0_modifiers)
+                      ->getImm() &
+                  SISrcMods::DST_OP_SEL)
+            return true;
         }
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
new file mode 100644
index 00000000000000..a8a26b49a57e98
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
+
+---
+name:            sdwa_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            opsel_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            opsel_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            sdwa_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...

>From 86c8316e99a9e5ccdca8d32e2d92890062abafd2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 13:59:08 -0700
Subject: [PATCH 03/17] Add minimal support for INLINEASM correctness

Change-Id: Icbe306650f5513787db77acd49eedbb08d8f2149
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  70 ++++--
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 221 ++++++++++++++++++
 2 files changed, 275 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 66349e9527725f..c87cdd043b8899 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -883,6 +883,39 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   return WaitStatesNeeded;
 }
 
+static const MachineOperand *
+getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  if (SIInstrInfo::isVALU(MI) && SIInstrInfo::isSDWA(MI)) {
+    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
+      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+        return nullptr;
+  } else if (SIInstrInfo::isVALU(MI)) {
+    if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
+        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
+          SISrcMods::DST_OP_SEL))
+      return nullptr;
+  }
+
+  const MachineOperand *Dst = nullptr;
+
+  if (SIInstrInfo::isVALU(MI))
+    Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+  // Assume inline asm has dst forwarding hazard
+  else if (MI.isInlineAsm()) {
+    for (auto &Op :
+         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+      if (Op.isReg() && Op.isDef()) {
+        Dst = &Op;
+        break;
+      }
+    }
+  }
+
+  return Dst;
+}
+
 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
   int WaitStatesNeeded = 0;
 
@@ -914,22 +947,10 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
     const int Shift16DefWaitstates = 1;
 
     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
-      if (!SIInstrInfo::isVALU(MI))
-        return false;
       const SIInstrInfo *TII = ST.getInstrInfo();
-      if (SIInstrInfo::isSDWA(MI)) {
-        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
-          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
-            return false;
-      } else {
-        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
-            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
-                  ->getImm() &
-              SISrcMods::DST_OP_SEL))
-          return false;
-      }
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+      const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
+      if (Dst) {
         Register Def = Dst->getReg();
 
         for (const MachineOperand &Use : VALU->all_uses()) {
@@ -950,7 +971,6 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
             return true;
         }
       }
-
       return false;
     };
 
@@ -1043,7 +1063,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
   // problematic thus far.
 
   // see checkVALUHazards()
-  if (!ST.has12DWordStoreHazard())
+  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
     return 0;
 
   const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1054,6 +1074,24 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
     if (Op.isReg() && Op.isDef()) {
       WaitStatesNeeded =
           std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
+
+      if (!TRI.isVectorRegister(MRI, Op.getReg()))
+        continue;
+
+      if (ST.hasDstSelForwardingHazard()) {
+        const int Shift16DefWaitstates = 1;
+
+        auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
+          const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
+          // Assume inline asm reads the dst
+          return Dst ? true : false;
+        };
+
+        int WaitStatesNeededForDef =
+            Shift16DefWaitstates -
+            getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index a8a26b49a57e98..55dfa8cb640091 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -27,6 +27,30 @@ body:            |
   S_ENDPGM 0
 ...
 
+---
+name:            sdwa_no_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
 ---
 name:            opsel_sdwa_hazard
 body:            |
@@ -52,6 +76,54 @@ body:            |
   S_ENDPGM 0
 ...
 
+---
+name:            opsel_no_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            no_opsel_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
 ---
 name:            opsel_opsel_hazard
 body:            |
@@ -77,6 +149,54 @@ body:            |
   S_ENDPGM 0
 ...
 
+---
+name:            opsel_no_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_no_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_no_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            no_opsel_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
 ---
 name:            sdwa_sdwa_hazard
 body:            |
@@ -101,3 +221,104 @@ body:            |
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
   S_ENDPGM 0
 ...
+
+---
+name:            sdwa_nosdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            inline_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: inline_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: inline_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            sdwa_inline_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_inline_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_inline_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  S_ENDPGM 0
+...
+
+
+---
+name:            inline_inline_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: inline_inline_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: inline_inline_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  S_ENDPGM 0
+...
+

>From b292ded9d932d9724fe63541d303e40f95fe66a4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 15:03:11 -0700
Subject: [PATCH 04/17] Review comments

Change-Id: Ib799d777e86306ce2ad4d74c421123e4420668e4
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 37 ++++++-------
 .../AMDGPU/sdwa-dst-preserve-hazard.mir       | 54 -------------------
 2 files changed, 17 insertions(+), 74 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c87cdd043b8899..e20b1b54d5b017 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -885,35 +885,32 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
 
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
+  // Assume inline asm has dst forwarding hazard
+  if (MI.isInlineAsm()) {
+    for (auto &Op :
+         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+      if (Op.isReg() && Op.isDef()) {
+        return &Op;
+      }
+    }
+  }
+
   const SIInstrInfo *TII = ST.getInstrInfo();
-  if (SIInstrInfo::isVALU(MI) && SIInstrInfo::isSDWA(MI)) {
+  if (!SIInstrInfo::isVALU(MI))
+    return nullptr;
+
+  if (SIInstrInfo::isSDWA(MI)) {
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
         return nullptr;
-  } else if (SIInstrInfo::isVALU(MI)) {
+  } else {
     if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
           SISrcMods::DST_OP_SEL))
       return nullptr;
   }
 
-  const MachineOperand *Dst = nullptr;
-
-  if (SIInstrInfo::isVALU(MI))
-    Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
-  // Assume inline asm has dst forwarding hazard
-  else if (MI.isInlineAsm()) {
-    for (auto &Op :
-         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-      if (Op.isReg() && Op.isDef()) {
-        Dst = &Op;
-        break;
-      }
-    }
-  }
-
-  return Dst;
+  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
 }
 
 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
@@ -1084,7 +1081,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
         auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
           const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
           // Assume inline asm reads the dst
-          return Dst ? true : false;
+          return (bool)Dst;
         };
 
         int WaitStatesNeededForDef =
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
deleted file mode 100644
index ec3b61c4776eef..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
+++ /dev/null
@@ -1,54 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
-
----
-name:            hazard_vcmpx_sdwa_permlane16
-body:            |
-  bb.0:
-  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
-    ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
-    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
-    ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
-    ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
-    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
-    ; HAZARD-NEXT: S_NOP 0
-    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
-    ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
-    ; HAZARD-NEXT: S_ENDPGM 0
-    ;
-    ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
-    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
-    ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
-    ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
-    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
-    ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
-    ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr7 = COPY $vgpr6, implicit $exec
-  renamable $vgpr6 = COPY $vgpr5, implicit $exec
-  renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-  renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
-  KILL killed renamable $vgpr2, renamable $vgpr3
-  KILL killed renamable $vgpr0, renamable $vgpr1
-  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
-  renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
-  renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
-  renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
-  GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
-  S_ENDPGM 0
-...

>From 63a0cf28f3aa214ab6837efbb8d733b4c56f31f4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 15:08:54 -0700
Subject: [PATCH 05/17] Only init TII when needed

Change-Id: I4f461219cd534229f21a36785808e785a5cb06ce
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index e20b1b54d5b017..1542a1056ec1d3 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -895,10 +895,10 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
     }
   }
 
-  const SIInstrInfo *TII = ST.getInstrInfo();
   if (!SIInstrInfo::isVALU(MI))
     return nullptr;
 
+  const SIInstrInfo *TII = ST.getInstrInfo();
   if (SIInstrInfo::isSDWA(MI)) {
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)

>From 7a6b34574b1b3f315bff0263a6e94a9c7b7e29e2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Jul 2024 12:12:48 -0700
Subject: [PATCH 06/17] Add snop for WAW with lo bits of dst

Change-Id: I29938c50257b6ebf302aafcd5b23d7484d06d89a
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 1542a1056ec1d3..ad7f694dec4a9f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -960,11 +960,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
           Register ThisDef = ThisDst->getReg();
           if (!TRI->regsOverlap(Def, ThisDef))
             return false;
-          if (AMDGPU::hasNamedOperand(VALU->getOpcode(),
-                                      AMDGPU::OpName::op_sel) &&
-              TII->getNamedOperand(*VALU, AMDGPU::OpName::src0_modifiers)
-                      ->getImm() &
-                  SISrcMods::DST_OP_SEL)
+          if (TII->isVOP3(*VALU) && !TII->isVOP3P(*VALU))
             return true;
         }
       }

>From 930d15ce58e5aa7ceabd96df93d200793f46c596 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Jul 2024 14:18:22 -0700
Subject: [PATCH 07/17] Review comments

Change-Id: I9882238a6227c7e6522e00fa98e1a3f18f7b4ec2
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 42 ++++++---
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 92 ++++++++-----------
 2 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ad7f694dec4a9f..8a6494d7599b01 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -885,16 +885,6 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
 
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
-  // Assume inline asm has dst forwarding hazard
-  if (MI.isInlineAsm()) {
-    for (auto &Op :
-         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-      if (Op.isReg() && Op.isDef()) {
-        return &Op;
-      }
-    }
-  }
-
   if (!SIInstrInfo::isVALU(MI))
     return nullptr;
 
@@ -946,8 +936,21 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
       const SIInstrInfo *TII = ST.getInstrInfo();
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
-      if (Dst) {
+      SmallVector<const MachineOperand *, 4> Dsts;
+      const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST);
+      if (ForwardedDst) {
+        Dsts.push_back(ForwardedDst);
+      } else if (MI.isInlineAsm()) {
+        // Assume inline asm has dst forwarding hazard
+        for (auto &Op :
+             drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+          if (Op.isReg() && Op.isDef()) {
+            Dsts.push_back(&Op);
+          }
+        }
+      }
+
+      for (auto Dst : Dsts) {
         Register Def = Dst->getReg();
 
         for (const MachineOperand &Use : VALU->all_uses()) {
@@ -1077,7 +1080,20 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
         auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
           const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
           // Assume inline asm reads the dst
-          return (bool)Dst;
+          if (Dst)
+            return true;
+
+          if (MI.isInlineAsm()) {
+            // Assume other inline asm has dst forwarding hazard
+            for (auto &Op :
+                 drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+              if (Op.isReg() && Op.isDef()) {
+                return true;
+              }
+            }
+          }
+
+          return false;
         };
 
         int WaitStatesNeededForDef =
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index 55dfa8cb640091..cda7f2a679fe06 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -13,41 +13,42 @@ body:            |
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_NOP 0
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: sdwa_opsel_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 
 ---
-name:            sdwa_no_opsel_hazard
+name:            sdwa_lo_opsel_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; HAZARD-LABEL: name: sdwa_lo_opsel_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; NOHAZARD-LABEL: name: sdwa_lo_opsel_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 
@@ -60,7 +61,7 @@ body:            |
     ; HAZARD-LABEL: name: opsel_sdwa_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_NOP 0
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; HAZARD-NEXT: S_ENDPGM 0
@@ -68,10 +69,10 @@ body:            |
     ; NOHAZARD-LABEL: name: opsel_sdwa_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
   S_ENDPGM 0
 ...
@@ -85,17 +86,17 @@ body:            |
     ; HAZARD-LABEL: name: opsel_no_sdwa_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
   S_ENDPGM 0
 ...
@@ -109,17 +110,17 @@ body:            |
     ; HAZARD-LABEL: name: no_opsel_sdwa_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
   S_ENDPGM 0
 ...
@@ -133,67 +134,46 @@ body:            |
     ; HAZARD-LABEL: name: opsel_opsel_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_NOP 0
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: opsel_opsel_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 
----
-name:            opsel_no_opsel_hazard
-body:            |
-  bb.0:
-  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
-
-    ; HAZARD-LABEL: name: opsel_no_opsel_hazard
-    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
-    ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; HAZARD-NEXT: S_ENDPGM 0
-    ;
-    ; NOHAZARD-LABEL: name: opsel_no_opsel_hazard
-    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
-    ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  S_ENDPGM 0
-...
+# TODO -- there is no reason for s_nop
 
 ---
-name:            no_opsel_opsel_hazard
+name:            opsel_opsel_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; HAZARD-LABEL: name: opsel_opsel_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; NOHAZARD-LABEL: name: opsel_opsel_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 

>From 6bc5b9742269243dfceaf6508eda5aa003ff7c5e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 30 Jul 2024 15:18:55 -0700
Subject: [PATCH 08/17] Add V_CVT_SR_*8_F32 as having forwarded dest + Address
 comments

Change-Id: If3662fe303c5ae4608a3fdb6bc2b231326cd4af0
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  40 +++--
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 144 ++++++++++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll     |   2 +
 3 files changed, 167 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8a6494d7599b01..8dff228fe9de2c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -889,14 +889,29 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
     return nullptr;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
+
+  unsigned Opcode = MI.getOpcode();
+
+  // There are three different types of instructions
+  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 with
+  // op_sel[3] != 0, and 3. CVR_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2]
+  // != 0
+
   if (SIInstrInfo::isSDWA(MI)) {
+    // Type 1: SDWA with dst_sel != DWORD
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
         return nullptr;
   } else {
-    if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
+    // Type 2 && Type 3: (VOP3 with op_sel[3] != 0) || (CVT_SR_FP8_F32 and
+    // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
+    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
-          SISrcMods::DST_OP_SEL))
+              SISrcMods::DST_OP_SEL ||
+          ((Opcode == AMDGPU::V_CVT_SR_BF8_F32_e64 ||
+            Opcode == AMDGPU::V_CVT_SR_FP8_F32_e64) &&
+           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
+            SISrcMods::OP_SEL_0))))
       return nullptr;
   }
 
@@ -953,18 +968,23 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
       for (auto Dst : Dsts) {
         Register Def = Dst->getReg();
 
+        // We must consider implicit reads of the VALU. SDWA with dst_sel and
+        // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
+        // and we must account for that hazard.
         for (const MachineOperand &Use : VALU->all_uses()) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
 
-        // We also read the dst for sub 32 writes to the same register for ECC
+        if (!TII->isVOP3(*VALU))
+          return false;
+
+        // We also must account for WAW hazards. In particular, WAW hazards with
+        // dest op_sel and dest preserve semantics will read the forwarded dest
+        // for parity check for ECC. Without accounting for this hazard, the ECC
+        // will be wrong.
         if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) {
-          Register ThisDef = ThisDst->getReg();
-          if (!TRI->regsOverlap(Def, ThisDef))
-            return false;
-          if (TII->isVOP3(*VALU) && !TII->isVOP3P(*VALU))
-            return true;
+          return TRI->regsOverlap(Def, ThisDst->getReg());
         }
       }
       return false;
@@ -1077,7 +1097,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
       if (ST.hasDstSelForwardingHazard()) {
         const int Shift16DefWaitstates = 1;
 
-        auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
+        auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) {
           const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
           // Assume inline asm reads the dst
           if (Dst)
@@ -1087,7 +1107,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
             // Assume other inline asm has dst forwarding hazard
             for (auto &Op :
                  drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-              if (Op.isReg() && Op.isDef()) {
+              if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) {
                 return true;
               }
             }
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index cda7f2a679fe06..fd97f74d0c744d 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -77,20 +77,21 @@ body:            |
   S_ENDPGM 0
 ...
 
+
 ---
-name:            opsel_no_sdwa_hazard
+name:            opsel_no_sdwa_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; HAZARD-LABEL: name: opsel_no_sdwa_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; NOHAZARD-LABEL: name: opsel_no_sdwa_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
@@ -102,19 +103,19 @@ body:            |
 ...
 
 ---
-name:            no_opsel_sdwa_hazard
+name:            no_opsel_sdwa_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; HAZARD-LABEL: name: no_opsel_sdwa_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; NOHAZARD-LABEL: name: no_opsel_sdwa_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
@@ -177,6 +178,32 @@ body:            |
   S_ENDPGM 0
 ...
 
+# DS_READ_U16_D16 has dest preserve semantics, but only VALU consumers have hazard
+
+---
+name:            sdwa_loadsel_no_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_loadsel_no_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_loadsel_no_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec
+  S_ENDPGM 0
+...
+
 ---
 name:            sdwa_sdwa_hazard
 body:            |
@@ -203,19 +230,118 @@ body:            |
 ...
 
 ---
-name:            sdwa_nosdwa_hazard
+name:            cvt_sdwa_hazard_1
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_hazard_1
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_1
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            cvt_sdwa_hazard_2
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_hazard_2
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_2
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            cvt_sdwa_hazard_3
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_hazard_3
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_3
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            cvt_sdwa_no_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_no_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_no_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            sdwa_nosdwa_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; HAZARD-LABEL: name: sdwa_nosdwa_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; NOHAZARD-LABEL: name: sdwa_nosdwa_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index d3fc96d7ff8012..8313f5b655efba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -375,6 +375,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX940-NEXT:    s_nop 0
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -469,6 +470,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX940-NEXT:    s_nop 0
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;

>From 7744d2c6230f041c89458ec74eb77a820a131ed9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 31 Jul 2024 10:43:43 -0700
Subject: [PATCH 09/17] Update comments

Change-Id: I3011b7a6092cb056bf6095966dee9bc6be0586b7
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8dff228fe9de2c..b7066de1203696 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -883,6 +883,12 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   return WaitStatesNeeded;
 }
 
+/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
+/// pack the computed value into correct bit position of the dest register. This
+/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
+/// dst_sel that is not aligned to the register. This function analayzes the \p
+/// MI and \returns an operand with dst setl forwarding issue, or nullptr if
+/// none exists.
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
   if (!SIInstrInfo::isVALU(MI))
@@ -893,17 +899,17 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
   unsigned Opcode = MI.getOpcode();
 
   // There are three different types of instructions
-  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 with
-  // op_sel[3] != 0, and 3. CVR_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2]
+  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
+  // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
+  // CVT_SR_BF8_F32 with op_sel[3:2]
   // != 0
-
   if (SIInstrInfo::isSDWA(MI)) {
     // Type 1: SDWA with dst_sel != DWORD
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
         return nullptr;
   } else {
-    // Type 2 && Type 3: (VOP3 with op_sel[3] != 0) || (CVT_SR_FP8_F32 and
+    // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
     // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &

>From d1b308415bdc5e349b0e36eb1a13b9ad288a23d3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 2 Aug 2024 09:24:52 -0700
Subject: [PATCH 10/17] Insert s_nop for all RAW and WAW

Change-Id: I838ccf7fc3e7263b95a179d4c3364d6297a9ac41
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 24 +++++++------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index b7066de1203696..13ed58e80f1d78 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -887,7 +887,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
 /// pack the computed value into correct bit position of the dest register. This
 /// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
 /// dst_sel that is not aligned to the register. This function analayzes the \p
-/// MI and \returns an operand with dst setl forwarding issue, or nullptr if
+/// MI and \returns an operand with dst forwarding issue, or nullptr if
 /// none exists.
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
@@ -972,25 +972,19 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
       }
 
       for (auto Dst : Dsts) {
-        Register Def = Dst->getReg();
-
         // We must consider implicit reads of the VALU. SDWA with dst_sel and
         // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
         // and we must account for that hazard.
-        for (const MachineOperand &Use : VALU->all_uses()) {
-          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
-            return true;
-        }
-
-        if (!TII->isVOP3(*VALU))
-          return false;
-
-        // We also must account for WAW hazards. In particular, WAW hazards with
-        // dest op_sel and dest preserve semantics will read the forwarded dest
+        // We also must account for WAW hazards. In particular, WAW with dest preserve semantics
+        // (e.g. VOP3 with op_sel, VOP2 && !zeroesHigh16BitsOfDest) will read the forwarded dest
         // for parity check for ECC. Without accounting for this hazard, the ECC
         // will be wrong.
-        if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) {
-          return TRI->regsOverlap(Def, ThisDst->getReg());
+        // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. complete zeroesHigh16BitsOfDest)
+        for (auto &Operand : VALU->operands()) {
+          if (Operand.isReg() &&
+            TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
+            return true;
+          }
         }
       }
       return false;

>From b02ce1694fb2a342e44c4bb4c5ee692711e0999e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 2 Aug 2024 10:43:33 -0700
Subject: [PATCH 11/17] Use searchable table instead of hardcoding instructions

Change-Id: Iecb574091f5389be115d5f853f3e06bcb396f7f9
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 28 ++++++++++---------
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         | 10 +++++++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 12 ++++++++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  3 ++
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  1 +
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |  2 ++
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   |  6 ++++
 7 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 13ed58e80f1d78..44c8020f512d3e 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -914,8 +914,7 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
               SISrcMods::DST_OP_SEL ||
-          ((Opcode == AMDGPU::V_CVT_SR_BF8_F32_e64 ||
-            Opcode == AMDGPU::V_CVT_SR_FP8_F32_e64) &&
+          (AMDGPU::isFP8DstSelInst(Opcode) &&
            (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
             SISrcMods::OP_SEL_0))))
       return nullptr;
@@ -955,7 +954,6 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
     const int Shift16DefWaitstates = 1;
 
     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
-      const SIInstrInfo *TII = ST.getInstrInfo();
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
       SmallVector<const MachineOperand *, 4> Dsts;
       const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST);
@@ -975,14 +973,16 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
         // We must consider implicit reads of the VALU. SDWA with dst_sel and
         // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
         // and we must account for that hazard.
-        // We also must account for WAW hazards. In particular, WAW with dest preserve semantics
-        // (e.g. VOP3 with op_sel, VOP2 && !zeroesHigh16BitsOfDest) will read the forwarded dest
-        // for parity check for ECC. Without accounting for this hazard, the ECC
-        // will be wrong.
-        // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. complete zeroesHigh16BitsOfDest)
+        // We also must account for WAW hazards. In particular, WAW with dest
+        // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
+        // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
+        // check for ECC. Without accounting for this hazard, the ECC will be
+        // wrong.
+        // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
+        // complete zeroesHigh16BitsOfDest)
         for (auto &Operand : VALU->operands()) {
           if (Operand.isReg() &&
-            TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
+              TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
             return true;
           }
         }
@@ -1088,12 +1088,14 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
   for (const MachineOperand &Op :
        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
     if (Op.isReg() && Op.isDef()) {
-      WaitStatesNeeded =
-          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
-
       if (!TRI.isVectorRegister(MRI, Op.getReg()))
         continue;
 
+      if (ST.has12DWordStoreHazard()) {
+        WaitStatesNeeded =
+            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
+      }
+
       if (ST.hasDstSelForwardingHazard()) {
         const int Shift16DefWaitstates = 1;
 
@@ -1104,7 +1106,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
             return true;
 
           if (MI.isInlineAsm()) {
-            // Assume other inline asm has dst forwarding hazard
+            // If MI is inline asm, assume it has dst forwarding hazard
             for (auto &Op :
                  drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
               if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d2838349340d2e..d0cb15629ac1ee 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2325,6 +2325,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
 
   field bit IsFP8SrcByteSel = 0;
   field bit IsFP8DstByteSel = 0;
+  field bit HasFP8DstByteSel = 0;
   field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
 
   field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -2904,6 +2905,15 @@ def getVCMPXOpFromVCMP : InstrMapping {
   let ValueCols = [["1"]];
 }
 
+def FP8DstByteSelTable : GenericTable {
+  let FilterClass = "VOP3_Pseudo";
+  let CppTypeName = "FP8DstByteSelInfo";
+  let Fields = ["Opcode", "HasFP8DstByteSel"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getFP8DstByteSelHelper";
+}
+
 def VOPDComponentTable : GenericTable {
   let FilterClass = "VOPD_Component";
   let CppTypeName = "VOPDComponentInfo";
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 96d4863e94014c..c3a648b3c2e06d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -385,6 +385,13 @@ struct SingleUseExceptionInfo {
   bool IsInvalidSingleUseProducer;
 };
 
+struct FP8DstByteSelInfo {
+  uint16_t Opcode;
+  bool HasFP8DstByteSel;
+};
+
+#define GET_FP8DstByteSelTable_DECL
+#define GET_FP8DstByteSelTable_IMPL
 #define GET_MTBUFInfoTable_DECL
 #define GET_MTBUFInfoTable_IMPL
 #define GET_MUBUFInfoTable_DECL
@@ -629,6 +636,11 @@ bool isInvalidSingleUseProducerInst(unsigned Opc) {
   return Info && Info->IsInvalidSingleUseProducer;
 }
 
+bool isFP8DstSelInst(unsigned Opc) {
+  const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
+  return Info ? Info->HasFP8DstByteSel : false;
+}
+
 unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
   const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
   return Info ? Info->Opcode3Addr : ~0u;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 429c3ad335d213..cb31cd690101d0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -861,6 +861,9 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo);
 LLVM_READONLY
 bool isTrue16Inst(unsigned Opc);
 
+LLVM_READONLY
+bool isFP8DstSelInst(unsigned Opc);
+
 LLVM_READONLY
 bool isInvalidSingleUseConsumerInst(unsigned Opc);
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index efa8e9c74d4495..b3691ff48e64d7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -568,6 +568,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
   let HasSrc2Mods = 1;
   let HasExtVOP3DPP = 1;
   let HasOpSel = 1;
+  let HasFP8DstByteSel = 1;
   let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
                             getAsmVOP3OpSel<3, HasClamp, HasOMod,
                                             HasSrc0FloatMods, HasSrc1FloatMods,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f2ed17ac305a1b..ae97a66a5827bd 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -113,6 +113,8 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   let IsWMMA = P.IsWMMA;
   let IsSWMMAC = P.IsSWMMAC;
 
+  bit HasFP8DstByteSel = !if(isVop3OpSel, P.HasFP8DstByteSel, 0);
+
   let AsmOperands = !if(isVop3OpSel,
                         P.AsmVOP3OpSel,
                         !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index fd97f74d0c744d..e24817078d8bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -78,6 +78,8 @@ body:            |
 ...
 
 
+# TODO -- there is no reason for s_nop (V_ADD_U16 doesn't preserve the dest)
+
 ---
 name:            opsel_no_sdwa_no_hazard
 body:            |
@@ -88,6 +90,7 @@ body:            |
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
@@ -328,6 +331,8 @@ body:            |
   S_ENDPGM 0
 ...
 
+# TODO -- there is no reason for s_nop (V_ADD_U16 doesn't preserve the dest)
+
 ---
 name:            sdwa_nosdwa_no_hazard
 body:            |
@@ -338,6 +343,7 @@ body:            |
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;

>From 7f43ff672c193dc68508a2370832a421aaea9890 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 5 Aug 2024 09:28:10 -0700
Subject: [PATCH 12/17] Consistently use HasFP8DstByteSel

Change-Id: Iddb81c71b16d48242e26fcb598e4d824efc55a3c
---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 1 +
 llvm/lib/Target/AMDGPU/VOPInstructions.td  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index b3691ff48e64d7..01b1c63dd0d6e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -588,6 +588,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
 class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
   VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
   let IsFP8DstByteSel = 1;
+  let HasFP8DstByteSel = 1;
   let HasClamp = 0;
   defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
   let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index ae97a66a5827bd..de9c7f22bdf598 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -113,7 +113,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   let IsWMMA = P.IsWMMA;
   let IsSWMMAC = P.IsSWMMAC;
 
-  bit HasFP8DstByteSel = !if(isVop3OpSel, P.HasFP8DstByteSel, 0);
+  bit HasFP8DstByteSel = P.HasFP8DstByteSel;
 
   let AsmOperands = !if(isVop3OpSel,
                         P.AsmVOP3OpSel,

>From 23d28ff25bec611ab3fd64ffc2a11d276079c941 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 16 Aug 2024 12:03:04 -0700
Subject: [PATCH 13/17] Review comments

Change-Id: I001760b83d26846912a24f0ed1c3f1b9be90915a
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 53 ++++++++++---------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 44c8020f512d3e..fd2f4810ec9c8c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -923,6 +923,30 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
   return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
 }
 
+/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
+/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
+/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
+static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
+                                            const MachineOperand *Dst,
+                                            const SIRegisterInfo *TRI) {
+  // We must consider implicit reads of the VALU. SDWA with dst_sel and
+  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
+  // and we must account for that hazard.
+  // We also must account for WAW hazards. In particular, WAW with dest
+  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
+  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
+  // check for ECC. Without accounting for this hazard, the ECC will be
+  // wrong.
+  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
+  // complete zeroesHigh16BitsOfDest)
+  for (auto &Operand : VALU->operands()) {
+    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
+      return true;
+    }
+  }
+  return false;
+}
+
 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
   int WaitStatesNeeded = 0;
 
@@ -955,38 +979,17 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
 
     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      SmallVector<const MachineOperand *, 4> Dsts;
       const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST);
       if (ForwardedDst) {
-        Dsts.push_back(ForwardedDst);
+        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
       } else if (MI.isInlineAsm()) {
         // Assume inline asm has dst forwarding hazard
-        for (auto &Op :
-             drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-          if (Op.isReg() && Op.isDef()) {
-            Dsts.push_back(&Op);
-          }
-        }
-      }
-
-      for (auto Dst : Dsts) {
-        // We must consider implicit reads of the VALU. SDWA with dst_sel and
-        // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
-        // and we must account for that hazard.
-        // We also must account for WAW hazards. In particular, WAW with dest
-        // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
-        // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
-        // check for ECC. Without accounting for this hazard, the ECC will be
-        // wrong.
-        // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
-        // complete zeroesHigh16BitsOfDest)
-        for (auto &Operand : VALU->operands()) {
-          if (Operand.isReg() &&
-              TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
+        for (auto &Def : MI.all_defs()) {
+          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
             return true;
-          }
         }
       }
+
       return false;
     };
 

>From a62e5c81f7760b133a1a8667366a9889250ebeed Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 19 Aug 2024 12:40:17 -0700
Subject: [PATCH 14/17] Consistent with downstream

Change-Id: I56ec0c004b941678bc5538b6eca5f02db5e1712e
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 50 ++++++++++---------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index fd2f4810ec9c8c..309560b9322030 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -982,7 +982,9 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
       const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST);
       if (ForwardedDst) {
         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
-      } else if (MI.isInlineAsm()) {
+      }
+
+      if (MI.isInlineAsm()) {
         // Assume inline asm has dst forwarding hazard
         for (auto &Def : MI.all_defs()) {
           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
@@ -1098,35 +1100,35 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
         WaitStatesNeeded =
             std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
       }
+    }
+  }
 
-      if (ST.hasDstSelForwardingHazard()) {
-        const int Shift16DefWaitstates = 1;
+  if (ST.hasDstSelForwardingHazard()) {
+    const int Shift16DefWaitstates = 1;
 
-        auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) {
-          const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
-          // Assume inline asm reads the dst
-          if (Dst)
-            return true;
+    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) {
+      const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
+      // Assume inline asm reads the dst
+      if (Dst)
+        return true;
 
-          if (MI.isInlineAsm()) {
-            // If MI is inline asm, assume it has dst forwarding hazard
-            for (auto &Op :
-                 drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-              if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) {
-                return true;
-              }
-            }
+      if (MI.isInlineAsm()) {
+        // If MI is inline asm, assume it has dst forwarding hazard
+        for (auto &Op :
+             drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+          if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) {
+            return true;
           }
+        }
+      }
 
-          return false;
-        };
+      return false;
+    };
 
-        int WaitStatesNeededForDef =
-            Shift16DefWaitstates -
-            getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
-        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
-      }
-    }
+    int WaitStatesNeededForDef =
+        Shift16DefWaitstates -
+        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
   }
 
   return WaitStatesNeeded;

>From 98388f59cc54d3fd00445556b335b170d1edffe7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 19 Aug 2024 13:49:44 -0700
Subject: [PATCH 15/17] Don't process all inlinseAsm regs

Change-Id: I4afe3d78f83da3ab0f37d98047e9f5182c7c0610
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 309560b9322030..7d7e2516a51df8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1110,13 +1110,12 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
       const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
       // Assume inline asm reads the dst
       if (Dst)
-        return true;
+        return IA->modifiesRegister(Dst->getReg(), &TRI) || IA->readsRegister(Dst->getReg(), &TRI);
 
       if (MI.isInlineAsm()) {
         // If MI is inline asm, assume it has dst forwarding hazard
-        for (auto &Op :
-             drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-          if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) {
+        for (auto &Def : MI.all_defs()) {
+          if (IA->modifiesRegister(Def.getReg(), &TRI) || IA->readsRegister(Def.getReg(), &TRI)) {
             return true;
           }
         }

>From 2f4b5eec2e0a8299817be6ab1435821827f2f099 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 19 Aug 2024 14:00:40 -0700
Subject: [PATCH 16/17] Formatting

Change-Id: Ib101d4d205a09bf3c7a4c3d4a8e82a480d0c47e6
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 7d7e2516a51df8..82adcab56784b6 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1110,12 +1110,14 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
       const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
       // Assume inline asm reads the dst
       if (Dst)
-        return IA->modifiesRegister(Dst->getReg(), &TRI) || IA->readsRegister(Dst->getReg(), &TRI);
+        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
+               IA->readsRegister(Dst->getReg(), &TRI);
 
       if (MI.isInlineAsm()) {
         // If MI is inline asm, assume it has dst forwarding hazard
         for (auto &Def : MI.all_defs()) {
-          if (IA->modifiesRegister(Def.getReg(), &TRI) || IA->readsRegister(Def.getReg(), &TRI)) {
+          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
+              IA->readsRegister(Def.getReg(), &TRI)) {
             return true;
           }
         }

>From cd0c3c49b6be18f6a0ae1af167077cc5b7aa4500 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 21 Aug 2024 15:58:12 -0700
Subject: [PATCH 17/17] Review Comments

---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 82adcab56784b6..e811024b73d31f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -977,16 +977,17 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
   if (ST.hasDstSelForwardingHazard()) {
     const int Shift16DefWaitstates = 1;
 
-    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
+    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST);
+      const MachineOperand *ForwardedDst =
+          getDstSelForwardingOperand(ProducerMI, ST);
       if (ForwardedDst) {
         return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
       }
 
-      if (MI.isInlineAsm()) {
+      if (ProducerMI.isInlineAsm()) {
         // Assume inline asm has dst forwarding hazard
-        for (auto &Def : MI.all_defs()) {
+        for (auto &Def : ProducerMI.all_defs()) {
           if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
             return true;
         }
@@ -1106,16 +1107,16 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
   if (ST.hasDstSelForwardingHazard()) {
     const int Shift16DefWaitstates = 1;
 
-    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) {
-      const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
+    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
+      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
       // Assume inline asm reads the dst
       if (Dst)
         return IA->modifiesRegister(Dst->getReg(), &TRI) ||
                IA->readsRegister(Dst->getReg(), &TRI);
 
-      if (MI.isInlineAsm()) {
+      if (ProducerMI.isInlineAsm()) {
         // If MI is inline asm, assume it has dst forwarding hazard
-        for (auto &Def : MI.all_defs()) {
+        for (auto &Def : ProducerMI.all_defs()) {
           if (IA->modifiesRegister(Def.getReg(), &TRI) ||
               IA->readsRegister(Def.getReg(), &TRI)) {
             return true;



More information about the llvm-commits mailing list