[llvm] [AMDGPU] Correctly insert s_nops for implicit read of SDWA (PR #100276)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 31 10:50:07 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/100276

>From 92f9ef7f238ed8e2d06f148fe6e7607fdd573794 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 23 Jul 2024 15:14:14 -0700
Subject: [PATCH 1/9] [AMDGPU] Correctly insert s_nops for implicit read of
 SDWA

Change-Id: I4e22bb3764705f328827eb64704720a0d6aa1a9b
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 24 +++++++++
 .../AMDGPU/sdwa-dst-preserve-hazard.mir       | 54 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a402fc6d7e611..570fcf63587fd 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -935,6 +935,30 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
+
+        // If the non-impacted bits of a sdwa dst are preserved, then we
+        // read them and must resolve the hazard. SDWA dst preserve is modelled
+        // as a tied-def implicit use
+        if (SIInstrInfo::isSDWA(*VALU)) {
+          if (auto *DstSel =
+                  TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) {
+            if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+              return false;
+            if (auto *ThisDst =
+                    TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+              if (!TRI->regsOverlap(Def, ThisDst->getReg()))
+                return false;
+              for (const MachineOperand &ImplicitOp :
+                   VALU->implicit_operands()) {
+                if (ImplicitOp.isDef())
+                  continue;
+                if (ImplicitOp.isReg() &&
+                    TRI->regsOverlap(Def, ImplicitOp.getReg()))
+                  return true;
+              }
+            }
+          }
+        }
       }
 
       return false;
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
new file mode 100644
index 0000000000000..ec3b61c4776ee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
+
+---
+name:            hazard_vcmpx_sdwa_permlane16
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
+    ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
+    ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr7 = COPY $vgpr6, implicit $exec
+  renamable $vgpr6 = COPY $vgpr5, implicit $exec
+  renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+  renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+  KILL killed renamable $vgpr2, renamable $vgpr3
+  KILL killed renamable $vgpr0, renamable $vgpr1
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+  renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+  S_ENDPGM 0
+...

>From c6b2c2c0bbcf8d3ae9f1b633db37d7b0fa51ee1a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 12:36:40 -0700
Subject: [PATCH 2/9] Handle opsel case + address review comments

Change-Id: I131dbb24762a53f31af0aa685c48b6c14233bf3d
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  36 +++---
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 103 ++++++++++++++++++
 2 files changed, 116 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 570fcf63587fd..66349e9527725 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -875,6 +875,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
     return DataIdx >= 0 &&
            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
   };
+
   int WaitStatesNeededForDef =
     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
@@ -931,33 +932,22 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
         Register Def = Dst->getReg();
 
-        for (const MachineOperand &Use : VALU->explicit_uses()) {
+        for (const MachineOperand &Use : VALU->all_uses()) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
 
-        // If the non-impacted bits of a sdwa dst are preserved, then we
-        // read them and must resolve the hazard. SDWA dst preserve is modelled
-        // as a tied-def implicit use
-        if (SIInstrInfo::isSDWA(*VALU)) {
-          if (auto *DstSel =
-                  TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) {
-            if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
-              return false;
-            if (auto *ThisDst =
-                    TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
-              if (!TRI->regsOverlap(Def, ThisDst->getReg()))
-                return false;
-              for (const MachineOperand &ImplicitOp :
-                   VALU->implicit_operands()) {
-                if (ImplicitOp.isDef())
-                  continue;
-                if (ImplicitOp.isReg() &&
-                    TRI->regsOverlap(Def, ImplicitOp.getReg()))
-                  return true;
-              }
-            }
-          }
+        // We also read the dst for sub 32 writes to the same register for ECC
+        if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) {
+          Register ThisDef = ThisDst->getReg();
+          if (!TRI->regsOverlap(Def, ThisDef))
+            return false;
+          if (AMDGPU::hasNamedOperand(VALU->getOpcode(),
+                                      AMDGPU::OpName::op_sel) &&
+              TII->getNamedOperand(*VALU, AMDGPU::OpName::src0_modifiers)
+                      ->getImm() &
+                  SISrcMods::DST_OP_SEL)
+            return true;
         }
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
new file mode 100644
index 0000000000000..a8a26b49a57e9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
+
+---
+name:            sdwa_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            opsel_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            opsel_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            sdwa_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...

>From 86c8316e99a9e5ccdca8d32e2d92890062abafd2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 13:59:08 -0700
Subject: [PATCH 3/9] Add minimal support for INLINEASM correctness

Change-Id: Icbe306650f5513787db77acd49eedbb08d8f2149
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  70 ++++--
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 221 ++++++++++++++++++
 2 files changed, 275 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 66349e9527725..c87cdd043b889 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -883,6 +883,39 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   return WaitStatesNeeded;
 }
 
+static const MachineOperand *
+getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  if (SIInstrInfo::isVALU(MI) && SIInstrInfo::isSDWA(MI)) {
+    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
+      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+        return nullptr;
+  } else if (SIInstrInfo::isVALU(MI)) {
+    if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
+        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
+          SISrcMods::DST_OP_SEL))
+      return nullptr;
+  }
+
+  const MachineOperand *Dst = nullptr;
+
+  if (SIInstrInfo::isVALU(MI))
+    Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+  // Assume inline asm has dst forwarding hazard
+  else if (MI.isInlineAsm()) {
+    for (auto &Op :
+         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+      if (Op.isReg() && Op.isDef()) {
+        Dst = &Op;
+        break;
+      }
+    }
+  }
+
+  return Dst;
+}
+
 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
   int WaitStatesNeeded = 0;
 
@@ -914,22 +947,10 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
     const int Shift16DefWaitstates = 1;
 
     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
-      if (!SIInstrInfo::isVALU(MI))
-        return false;
       const SIInstrInfo *TII = ST.getInstrInfo();
-      if (SIInstrInfo::isSDWA(MI)) {
-        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
-          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
-            return false;
-      } else {
-        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
-            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
-                  ->getImm() &
-              SISrcMods::DST_OP_SEL))
-          return false;
-      }
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+      const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
+      if (Dst) {
         Register Def = Dst->getReg();
 
         for (const MachineOperand &Use : VALU->all_uses()) {
@@ -950,7 +971,6 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
             return true;
         }
       }
-
       return false;
     };
 
@@ -1043,7 +1063,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
   // problematic thus far.
 
   // see checkVALUHazards()
-  if (!ST.has12DWordStoreHazard())
+  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
     return 0;
 
   const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1054,6 +1074,24 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
     if (Op.isReg() && Op.isDef()) {
       WaitStatesNeeded =
           std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
+
+      if (!TRI.isVectorRegister(MRI, Op.getReg()))
+        continue;
+
+      if (ST.hasDstSelForwardingHazard()) {
+        const int Shift16DefWaitstates = 1;
+
+        auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
+          const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
+          // Assume inline asm reads the dst
+          return Dst ? true : false;
+        };
+
+        int WaitStatesNeededForDef =
+            Shift16DefWaitstates -
+            getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index a8a26b49a57e9..55dfa8cb64009 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -27,6 +27,30 @@ body:            |
   S_ENDPGM 0
 ...
 
+---
+name:            sdwa_no_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
 ---
 name:            opsel_sdwa_hazard
 body:            |
@@ -52,6 +76,54 @@ body:            |
   S_ENDPGM 0
 ...
 
+---
+name:            opsel_no_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            no_opsel_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
 ---
 name:            opsel_opsel_hazard
 body:            |
@@ -77,6 +149,54 @@ body:            |
   S_ENDPGM 0
 ...
 
+---
+name:            opsel_no_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: opsel_no_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: opsel_no_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            no_opsel_opsel_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
 ---
 name:            sdwa_sdwa_hazard
 body:            |
@@ -101,3 +221,104 @@ body:            |
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
   S_ENDPGM 0
 ...
+
+---
+name:            sdwa_nosdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            inline_sdwa_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: inline_sdwa_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: inline_sdwa_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            sdwa_inline_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_inline_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_inline_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  S_ENDPGM 0
+...
+
+
+---
+name:            inline_inline_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: inline_inline_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: inline_inline_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1
+  S_ENDPGM 0
+...
+

>From b292ded9d932d9724fe63541d303e40f95fe66a4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 15:03:11 -0700
Subject: [PATCH 4/9] Review comments

Change-Id: Ib799d777e86306ce2ad4d74c421123e4420668e4
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 37 ++++++-------
 .../AMDGPU/sdwa-dst-preserve-hazard.mir       | 54 -------------------
 2 files changed, 17 insertions(+), 74 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c87cdd043b889..e20b1b54d5b01 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -885,35 +885,32 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
 
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
+  // Assume inline asm has dst forwarding hazard
+  if (MI.isInlineAsm()) {
+    for (auto &Op :
+         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+      if (Op.isReg() && Op.isDef()) {
+        return &Op;
+      }
+    }
+  }
+
   const SIInstrInfo *TII = ST.getInstrInfo();
-  if (SIInstrInfo::isVALU(MI) && SIInstrInfo::isSDWA(MI)) {
+  if (!SIInstrInfo::isVALU(MI))
+    return nullptr;
+
+  if (SIInstrInfo::isSDWA(MI)) {
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
         return nullptr;
-  } else if (SIInstrInfo::isVALU(MI)) {
+  } else {
     if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
           SISrcMods::DST_OP_SEL))
       return nullptr;
   }
 
-  const MachineOperand *Dst = nullptr;
-
-  if (SIInstrInfo::isVALU(MI))
-    Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
-  // Assume inline asm has dst forwarding hazard
-  else if (MI.isInlineAsm()) {
-    for (auto &Op :
-         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-      if (Op.isReg() && Op.isDef()) {
-        Dst = &Op;
-        break;
-      }
-    }
-  }
-
-  return Dst;
+  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
 }
 
 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
@@ -1084,7 +1081,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
         auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
           const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
           // Assume inline asm reads the dst
-          return Dst ? true : false;
+          return (bool)Dst;
         };
 
         int WaitStatesNeededForDef =
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
deleted file mode 100644
index ec3b61c4776ee..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
+++ /dev/null
@@ -1,54 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
-
----
-name:            hazard_vcmpx_sdwa_permlane16
-body:            |
-  bb.0:
-  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
-    ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
-    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
-    ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
-    ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
-    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
-    ; HAZARD-NEXT: S_NOP 0
-    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
-    ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
-    ; HAZARD-NEXT: S_ENDPGM 0
-    ;
-    ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
-    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
-    ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
-    ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
-    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
-    ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
-    ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr7 = COPY $vgpr6, implicit $exec
-  renamable $vgpr6 = COPY $vgpr5, implicit $exec
-  renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
-  renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
-  KILL killed renamable $vgpr2, renamable $vgpr3
-  KILL killed renamable $vgpr0, renamable $vgpr1
-  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
-  renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
-  renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
-  renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
-  GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
-  S_ENDPGM 0
-...

>From 63a0cf28f3aa214ab6837efbb8d733b4c56f31f4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 24 Jul 2024 15:08:54 -0700
Subject: [PATCH 5/9] Only init TII when needed

Change-Id: I4f461219cd534229f21a36785808e785a5cb06ce
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index e20b1b54d5b01..1542a1056ec1d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -895,10 +895,10 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
     }
   }
 
-  const SIInstrInfo *TII = ST.getInstrInfo();
   if (!SIInstrInfo::isVALU(MI))
     return nullptr;
 
+  const SIInstrInfo *TII = ST.getInstrInfo();
   if (SIInstrInfo::isSDWA(MI)) {
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)

>From 7a6b34574b1b3f315bff0263a6e94a9c7b7e29e2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Jul 2024 12:12:48 -0700
Subject: [PATCH 6/9] Add snop for WAW with lo bits of dst

Change-Id: I29938c50257b6ebf302aafcd5b23d7484d06d89a
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 1542a1056ec1d..ad7f694dec4a9 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -960,11 +960,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
           Register ThisDef = ThisDst->getReg();
           if (!TRI->regsOverlap(Def, ThisDef))
             return false;
-          if (AMDGPU::hasNamedOperand(VALU->getOpcode(),
-                                      AMDGPU::OpName::op_sel) &&
-              TII->getNamedOperand(*VALU, AMDGPU::OpName::src0_modifiers)
-                      ->getImm() &
-                  SISrcMods::DST_OP_SEL)
+          if (TII->isVOP3(*VALU) && !TII->isVOP3P(*VALU))
             return true;
         }
       }

>From 930d15ce58e5aa7ceabd96df93d200793f46c596 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Jul 2024 14:18:22 -0700
Subject: [PATCH 7/9] Review comments

Change-Id: I9882238a6227c7e6522e00fa98e1a3f18f7b4ec2
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 42 ++++++---
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 92 ++++++++-----------
 2 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ad7f694dec4a9..8a6494d7599b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -885,16 +885,6 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
 
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
-  // Assume inline asm has dst forwarding hazard
-  if (MI.isInlineAsm()) {
-    for (auto &Op :
-         llvm::drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-      if (Op.isReg() && Op.isDef()) {
-        return &Op;
-      }
-    }
-  }
-
   if (!SIInstrInfo::isVALU(MI))
     return nullptr;
 
@@ -946,8 +936,21 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
       const SIInstrInfo *TII = ST.getInstrInfo();
       const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
-      if (Dst) {
+      SmallVector<const MachineOperand *, 4> Dsts;
+      const MachineOperand *ForwardedDst = getDstSelForwardingOperand(MI, ST);
+      if (ForwardedDst) {
+        Dsts.push_back(ForwardedDst);
+      } else if (MI.isInlineAsm()) {
+        // Assume inline asm has dst forwarding hazard
+        for (auto &Op :
+             drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+          if (Op.isReg() && Op.isDef()) {
+            Dsts.push_back(&Op);
+          }
+        }
+      }
+
+      for (auto Dst : Dsts) {
         Register Def = Dst->getReg();
 
         for (const MachineOperand &Use : VALU->all_uses()) {
@@ -1077,7 +1080,20 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
         auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
           const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
           // Assume inline asm reads the dst
-          return (bool)Dst;
+          if (Dst)
+            return true;
+
+          if (MI.isInlineAsm()) {
+            // Assume other inline asm has dst forwarding hazard
+            for (auto &Op :
+                 drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
+              if (Op.isReg() && Op.isDef()) {
+                return true;
+              }
+            }
+          }
+
+          return false;
         };
 
         int WaitStatesNeededForDef =
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index 55dfa8cb64009..cda7f2a679fe0 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -13,41 +13,42 @@ body:            |
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_NOP 0
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: sdwa_opsel_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 
 ---
-name:            sdwa_no_opsel_hazard
+name:            sdwa_lo_opsel_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; HAZARD-LABEL: name: sdwa_lo_opsel_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: sdwa_no_opsel_hazard
+    ; NOHAZARD-LABEL: name: sdwa_lo_opsel_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 
@@ -60,7 +61,7 @@ body:            |
     ; HAZARD-LABEL: name: opsel_sdwa_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_NOP 0
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; HAZARD-NEXT: S_ENDPGM 0
@@ -68,10 +69,10 @@ body:            |
     ; NOHAZARD-LABEL: name: opsel_sdwa_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
   S_ENDPGM 0
 ...
@@ -85,17 +86,17 @@ body:            |
     ; HAZARD-LABEL: name: opsel_no_sdwa_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
   S_ENDPGM 0
 ...
@@ -109,17 +110,17 @@ body:            |
     ; HAZARD-LABEL: name: no_opsel_sdwa_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
   renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
   S_ENDPGM 0
 ...
@@ -133,67 +134,46 @@ body:            |
     ; HAZARD-LABEL: name: opsel_opsel_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_NOP 0
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
     ; NOHAZARD-LABEL: name: opsel_opsel_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 
----
-name:            opsel_no_opsel_hazard
-body:            |
-  bb.0:
-  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
-
-    ; HAZARD-LABEL: name: opsel_no_opsel_hazard
-    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
-    ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; HAZARD-NEXT: S_ENDPGM 0
-    ;
-    ; NOHAZARD-LABEL: name: opsel_no_opsel_hazard
-    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
-    ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  S_ENDPGM 0
-...
+# TODO -- there is no reason for s_nop
 
 ---
-name:            no_opsel_opsel_hazard
+name:            opsel_opsel_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; HAZARD-LABEL: name: opsel_opsel_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; HAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: no_opsel_opsel_hazard
+    ; NOHAZARD-LABEL: name: opsel_opsel_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-    ; NOHAZARD-NEXT: renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; NOHAZARD-NEXT: S_ENDPGM 0
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 12, killed $vgpr3, 8, killed $vgpr4, 8, killed $vgpr2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  renamable $vgpr0 = nofpexcept V_PK_FMA_F16 0, killed $vgpr1, 8, killed $vgpr4, 8, killed $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
+  renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
   S_ENDPGM 0
 ...
 

>From 6bc5b9742269243dfceaf6508eda5aa003ff7c5e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 30 Jul 2024 15:18:55 -0700
Subject: [PATCH 8/9] Add V_CVT_SR_*8_F32 as having forwarded dest + Address
 comments

Change-Id: If3662fe303c5ae4608a3fdb6bc2b231326cd4af0
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  40 +++--
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 144 ++++++++++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll     |   2 +
 3 files changed, 167 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8a6494d7599b0..8dff228fe9de2 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -889,14 +889,29 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
     return nullptr;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
+
+  unsigned Opcode = MI.getOpcode();
+
+  // There are three different types of instructions
+  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 with
+  // op_sel[3] != 0, and 3. CVR_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2]
+  // != 0
+
   if (SIInstrInfo::isSDWA(MI)) {
+    // Type 1: SDWA with dst_sel != DWORD
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
         return nullptr;
   } else {
-    if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
+    // Type 2 && Type 3: (VOP3 with op_sel[3] != 0) || (CVT_SR_FP8_F32 and
+    // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
+    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
-          SISrcMods::DST_OP_SEL))
+              SISrcMods::DST_OP_SEL ||
+          ((Opcode == AMDGPU::V_CVT_SR_BF8_F32_e64 ||
+            Opcode == AMDGPU::V_CVT_SR_FP8_F32_e64) &&
+           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
+            SISrcMods::OP_SEL_0))))
       return nullptr;
   }
 
@@ -953,18 +968,23 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
       for (auto Dst : Dsts) {
         Register Def = Dst->getReg();
 
+        // We must consider implicit reads of the VALU. SDWA with dst_sel and
+        // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
+        // and we must account for that hazard.
         for (const MachineOperand &Use : VALU->all_uses()) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
 
-        // We also read the dst for sub 32 writes to the same register for ECC
+        if (!TII->isVOP3(*VALU))
+          return false;
+
+        // We also must account for WAW hazards. In particular, WAW hazards with
+        // dest op_sel and dest preserve semantics will read the forwarded dest
+        // for parity check for ECC. Without accounting for this hazard, the ECC
+        // will be wrong.
         if (auto *ThisDst = TII->getNamedOperand(*VALU, AMDGPU::OpName::vdst)) {
-          Register ThisDef = ThisDst->getReg();
-          if (!TRI->regsOverlap(Def, ThisDef))
-            return false;
-          if (TII->isVOP3(*VALU) && !TII->isVOP3P(*VALU))
-            return true;
+          return TRI->regsOverlap(Def, ThisDst->getReg());
         }
       }
       return false;
@@ -1077,7 +1097,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
       if (ST.hasDstSelForwardingHazard()) {
         const int Shift16DefWaitstates = 1;
 
-        auto IsShift16BitDefFn = [this](const MachineInstr &MI) {
+        auto IsShift16BitDefFn = [this, &IA](const MachineInstr &MI) {
           const MachineOperand *Dst = getDstSelForwardingOperand(MI, ST);
           // Assume inline asm reads the dst
           if (Dst)
@@ -1087,7 +1107,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
             // Assume other inline asm has dst forwarding hazard
             for (auto &Op :
                  drop_begin(MI.operands(), InlineAsm::MIOp_FirstOperand)) {
-              if (Op.isReg() && Op.isDef()) {
+              if (Op.isReg() && IA->modifiesRegister(Op.getReg(), &TRI)) {
                 return true;
               }
             }
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index cda7f2a679fe0..fd97f74d0c744 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -77,20 +77,21 @@ body:            |
   S_ENDPGM 0
 ...
 
+
 ---
-name:            opsel_no_sdwa_hazard
+name:            opsel_no_sdwa_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; HAZARD-LABEL: name: opsel_no_sdwa_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: opsel_no_sdwa_hazard
+    ; NOHAZARD-LABEL: name: opsel_no_sdwa_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec
@@ -102,19 +103,19 @@ body:            |
 ...
 
 ---
-name:            no_opsel_sdwa_hazard
+name:            no_opsel_sdwa_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; HAZARD-LABEL: name: no_opsel_sdwa_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: no_opsel_sdwa_hazard
+    ; NOHAZARD-LABEL: name: no_opsel_sdwa_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec
@@ -177,6 +178,32 @@ body:            |
   S_ENDPGM 0
 ...
 
+# DS_READ_U16_D16 has dest preserve semantics, but only VALU consumers have hazard
+
+---
+name:            sdwa_loadsel_no_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: sdwa_loadsel_no_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: sdwa_loadsel_no_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec
+  renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec
+  S_ENDPGM 0
+...
+
 ---
 name:            sdwa_sdwa_hazard
 body:            |
@@ -203,19 +230,118 @@ body:            |
 ...
 
 ---
-name:            sdwa_nosdwa_hazard
+name:            cvt_sdwa_hazard_1
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_hazard_1
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_1
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            cvt_sdwa_hazard_2
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_hazard_2
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_2
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            cvt_sdwa_hazard_3
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_hazard_3
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_3
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            cvt_sdwa_no_hazard
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+
+    ; HAZARD-LABEL: name: cvt_sdwa_no_hazard
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: cvt_sdwa_no_hazard
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  S_ENDPGM 0
+...
+
+---
+name:            sdwa_nosdwa_no_hazard
 body:            |
   bb.0:
   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
 
-    ; HAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; HAZARD-LABEL: name: sdwa_nosdwa_no_hazard
     ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; HAZARD-NEXT: {{  $}}
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
     ; HAZARD-NEXT: S_ENDPGM 0
     ;
-    ; NOHAZARD-LABEL: name: sdwa_nosdwa_hazard
+    ; NOHAZARD-LABEL: name: sdwa_nosdwa_no_hazard
     ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
     ; NOHAZARD-NEXT: {{  $}}
     ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index d3fc96d7ff801..8313f5b655efb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -375,6 +375,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX940-NEXT:    s_nop 0
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -469,6 +470,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX940-NEXT:    s_nop 0
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;

>From 7744d2c6230f041c89458ec74eb77a820a131ed9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 31 Jul 2024 10:43:43 -0700
Subject: [PATCH 9/9] Update comments

Change-Id: I3011b7a6092cb056bf6095966dee9bc6be0586b7
---
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8dff228fe9de2..b7066de120369 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -883,6 +883,12 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   return WaitStatesNeeded;
 }
 
+/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
+/// pack the computed value into correct bit position of the dest register. This
+/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
+/// dst_sel that is not aligned to the register. This function analayzes the \p
+/// MI and \returns an operand with dst setl forwarding issue, or nullptr if
+/// none exists.
 static const MachineOperand *
 getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
   if (!SIInstrInfo::isVALU(MI))
@@ -893,17 +899,17 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
   unsigned Opcode = MI.getOpcode();
 
   // There are three different types of instructions
-  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 with
-  // op_sel[3] != 0, and 3. CVR_SR_FP8_F32 and CVT_SR_BF8_F32 with op_sel[3:2]
+  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
+  // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
+  // CVT_SR_BF8_F32 with op_sel[3:2]
   // != 0
-
   if (SIInstrInfo::isSDWA(MI)) {
     // Type 1: SDWA with dst_sel != DWORD
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
       if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
         return nullptr;
   } else {
-    // Type 2 && Type 3: (VOP3 with op_sel[3] != 0) || (CVT_SR_FP8_F32 and
+    // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
     // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
     if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
         !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &



More information about the llvm-commits mailing list