[llvm] [AMDGPU] Correctly insert s_nops for implicit read of SDWA (PR #100276)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 23 16:08:21 PDT 2024


https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/100276

MI300 ISA section 4.5 states there is a hazard between "VALU op which uses OPSEL or SDWA with changes the result’s bit position" and "VALU op consumes result of that op"

This includes the case where the second op is SDWA with dst_sel != DWORD && dst_unused == UNUSED_PRESERVE. In this case, there is an implicit read of the first op dst and the compiler needs to resolve this hazard. Confirmed with HW team.

We model dst_unused == UNUSED_PRESERVE as tied-def of implicit operand, so this PR checks for that.

>From 92f9ef7f238ed8e2d06f148fe6e7607fdd573794 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 23 Jul 2024 15:14:14 -0700
Subject: [PATCH] [AMDGPU] Correctly insert s_nops for implicit read of SDWA

Change-Id: I4e22bb3764705f328827eb64704720a0d6aa1a9b
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 24 +++++++++
 .../AMDGPU/sdwa-dst-preserve-hazard.mir       | 54 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a402fc6d7e611..570fcf63587fd 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -935,6 +935,30 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
             return true;
         }
+
+        // If the non-impacted bits of a sdwa dst are preserved, then we
+        // read them and must resolve the hazard. SDWA dst preserve is modelled
+        // as a tied-def implicit use
+        if (SIInstrInfo::isSDWA(*VALU)) {
+          if (auto *DstSel =
+                  TII->getNamedOperand(*VALU, AMDGPU::OpName::dst_sel)) {
+            if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+              return false;
+            if (auto *ThisDst =
+                    TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+              if (!TRI->regsOverlap(Def, ThisDst->getReg()))
+                return false;
+              for (const MachineOperand &ImplicitOp :
+                   VALU->implicit_operands()) {
+                if (ImplicitOp.isDef())
+                  continue;
+                if (ImplicitOp.isReg() &&
+                    TRI->regsOverlap(Def, ImplicitOp.getReg()))
+                  return true;
+              }
+            }
+          }
+        }
       }
 
       return false;
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
new file mode 100644
index 0000000000000..ec3b61c4776ee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-dst-preserve-hazard.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
+
+---
+name:            hazard_vcmpx_sdwa_permlane16
+body:            |
+  bb.0:
+  liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; HAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
+    ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; HAZARD-NEXT: {{  $}}
+    ; HAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; HAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
+    ; HAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+    ; HAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+    ; HAZARD-NEXT: S_NOP 0
+    ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; HAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; HAZARD-NEXT: S_ENDPGM 0
+    ;
+    ; NOHAZARD-LABEL: name: hazard_vcmpx_sdwa_permlane16
+    ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5, $vgpr6
+    ; NOHAZARD-NEXT: {{  $}}
+    ; NOHAZARD-NEXT: renamable $vgpr7 = COPY $vgpr6, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr6 = COPY $vgpr5, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: KILL killed renamable $vgpr2, renamable $vgpr3
+    ; NOHAZARD-NEXT: KILL killed renamable $vgpr0, renamable $vgpr1
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+    ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+    ; NOHAZARD-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; NOHAZARD-NEXT: S_ENDPGM 0
+  renamable $vgpr7 = COPY $vgpr6, implicit $exec
+  renamable $vgpr6 = COPY $vgpr5, implicit $exec
+  renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+  renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec
+  KILL killed renamable $vgpr2, renamable $vgpr3
+  KILL killed renamable $vgpr0, renamable $vgpr1
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 1, 0, 3, 3, implicit $exec
+  renamable $vgpr1 = V_ADD_U16_sdwa 0, $vgpr4, 0, $vgpr5, 0, 6, 0, 5, 5, implicit $exec
+  renamable $vgpr0 = V_OR_B32_sdwa 0, killed $vgpr1, 0, killed $vgpr0, 0, 5, 0, 0, 6, implicit $exec
+  renamable $vgpr0 = V_ADD_U16_sdwa 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, 2, 6, 1, implicit $exec, implicit killed $vgpr0(tied-def 0)
+  GLOBAL_STORE_DWORD killed renamable $vgpr6_vgpr7, killed renamable $vgpr0, 0, 0, implicit $exec
+  S_ENDPGM 0
+...



More information about the llvm-commits mailing list