[llvm] [AMDGPU] Update hasUnwantedEffectsWhenEXECEmpty (PR #97982)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 7 19:58:28 PDT 2024


https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/97982

Add barriers and s_wait_event to hasUnwantedEffectsWhenEXECEmpty.
Add a comment documenting the current expected use of the function.

>From e072f1f8da7031f58a3d9918659d3fa1c68af55f Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 4 Jul 2024 14:10:33 +0900
Subject: [PATCH 1/2] [AMDGPU] Update EXECZ retention in SIPreEmitPeephole for
 GFX10/12

The check to maintain EXECZ branches only checks S_WAITCNT.
Add handling for new waitcnt instructions in GFX10 and GFX12.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  23 ++
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |   2 +-
 .../CodeGen/AMDGPU/insert-skips-gfx10.mir     | 186 +++++++++++
 .../CodeGen/AMDGPU/insert-skips-gfx12.mir     | 302 ++++++++++++++++++
 4 files changed, 512 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1e2b687854c77a..b723deb9543cdc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -967,6 +967,29 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     }
   }
 
+  bool isWaitcnt(unsigned Opcode) const {
+    switch (getNonSoftWaitcntOpcode(Opcode)) {
+    case AMDGPU::S_WAITCNT:
+    case AMDGPU::S_WAITCNT_VSCNT:
+    case AMDGPU::S_WAITCNT_VMCNT:
+    case AMDGPU::S_WAITCNT_EXPCNT:
+    case AMDGPU::S_WAITCNT_LGKMCNT:
+    case AMDGPU::S_WAIT_LOADCNT:
+    case AMDGPU::S_WAIT_LOADCNT_DSCNT:
+    case AMDGPU::S_WAIT_STORECNT:
+    case AMDGPU::S_WAIT_STORECNT_DSCNT:
+    case AMDGPU::S_WAIT_SAMPLECNT:
+    case AMDGPU::S_WAIT_BVHCNT:
+    case AMDGPU::S_WAIT_EXPCNT:
+    case AMDGPU::S_WAIT_DSCNT:
+    case AMDGPU::S_WAIT_KMCNT:
+    case AMDGPU::S_WAIT_IDLE:
+      return true;
+    default:
+      return false;
+    }
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(isCopyInstr(MI));
     Register Dest = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 875bccb208c846..1334029544f999 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -328,7 +328,7 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
 
       // These instructions are potentially expensive even if EXEC = 0.
       if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
-          TII->isDS(MI) || MI.getOpcode() == AMDGPU::S_WAITCNT)
+          TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
         return true;
 
       ++NumInstr;
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
new file mode 100644
index 00000000000000..1d3132dbe2af2f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
@@ -0,0 +1,186 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
+
+---
+name: skip_waitcnt_vscnt
+body: |
+  ; CHECK-LABEL: name: skip_waitcnt_vscnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAITCNT_VSCNT $sgpr_null, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAITCNT_VSCNT $sgpr_null, 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_waitcnt_expcnt
+body: |
+  ; CHECK-LABEL: name: skip_waitcnt_expcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAITCNT_EXPCNT $sgpr_null, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAITCNT_EXPCNT $sgpr_null, 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_waitcnt_vmcnt
+body: |
+  ; CHECK-LABEL: name: skip_waitcnt_vmcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAITCNT_VMCNT $sgpr_null, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAITCNT_VMCNT $sgpr_null, 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_waitcnt_lgkmcnt
+body: |
+  ; CHECK-LABEL: name: skip_waitcnt_lgkmcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAITCNT_LGKMCNT $sgpr_null, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAITCNT_LGKMCNT $sgpr_null, 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_idle
+body: |
+  ; CHECK-LABEL: name: skip_wait_idle
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_IDLE
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_IDLE
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_bvh
+body: |
+  ; CHECK-LABEL: name: skip_bvh
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 = IMPLICIT_DEF
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+
+  bb.2:
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
new file mode 100644
index 00000000000000..c0b839d218a95f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -0,0 +1,302 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
+
+---
+name: skip_wait_loadcnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_loadcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_LOADCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_LOADCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_loadcnt_dscnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_loadcnt_dscnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_LOADCNT_DSCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_LOADCNT_DSCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_storecnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_storecnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_STORECNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_STORECNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_storecnt_dscnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_storecnt_dscnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_STORECNT_DSCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_STORECNT_DSCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_samplecnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_samplecnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_SAMPLECNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_SAMPLECNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_bvhcnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_bvhcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_BVHCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_BVHCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_expcnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_expcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_EXPCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_EXPCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_dscnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_dscnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_DSCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_DSCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_kmcnt
+body: |
+  ; CHECK-LABEL: name: skip_wait_kmcnt
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_KMCNT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_KMCNT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_wait_idle
+body: |
+  ; CHECK-LABEL: name: skip_wait_idle
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_IDLE
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_IDLE
+
+  bb.2:
+    S_ENDPGM 0
+...

>From 77103196d63c905c45172495b25be1300f11a4d8 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 8 Jul 2024 11:22:48 +0900
Subject: [PATCH 2/2] [AMDGPU] Update hasUnwantedEffectsWhenEXECEmpty

Add barriers and s_wait_event to hasUnwantedEffectsWhenEXECEmpty.
Add a comment documenting the current expected use of the function.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  14 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   8 +
 .../CodeGen/AMDGPU/insert-skips-gfx10.mir     |  30 ++
 .../CodeGen/AMDGPU/insert-skips-gfx12.mir     | 308 ++++++++++++++++++
 4 files changed, 359 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cc1b9ac0c9ecda..a2cb3834643227 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4118,6 +4118,13 @@ bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
 }
 
 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
+  // This function is used to determine if an instruction can be safely
+  // executed under EXECZ without hardware error, indeterminate results,
+  // and/or visible effects on future vector execution or outside the shader.
+  // Note: as of 2024 the only use of this is SIPreEmitPeephole where it is
+  // used in removing branches over short EXECZ sequences.
+  // As such it embeds certain assumptions which may not apply in every case
+  // of EXECZ execution.
   unsigned Opcode = MI.getOpcode();
 
   if (MI.mayStore() && isSMRD(MI))
@@ -4136,12 +4143,17 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
       isEXP(Opcode) ||
       Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
-      Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
+      Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER ||
+      Opcode == AMDGPU::S_WAIT_EVENT)
     return true;
 
   if (MI.isCall() || MI.isInlineAsm())
     return true; // conservative assumption
 
+  // Assume that barrier interactions are only intended with active lanes.
+  if (isBarrierRelated(Opcode))
+    return true;
+
   // A mode change is a scalar operation that influences vector instructions.
   if (modifiesModeRegister(MI))
     return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b723deb9543cdc..bee24b3a7a91b3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -936,6 +936,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
            Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
   }
 
+  bool isBarrierRelated(unsigned Opcode) const {
+    return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
+           Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
+           Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
+           Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
+           Opcode == AMDGPU::S_BARRIER_LEAVE;
+  }
+
   static bool doesNotReadTiedSource(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
index 1d3132dbe2af2f..b4ed3cafbacb5f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
@@ -184,3 +184,33 @@ body: |
   bb.2:
     S_ENDPGM 0
 ...
+
+---
+name: skip_barrier
+body: |
+  ; CHECK-LABEL: name: skip_barrier
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER
+
+  bb.2:
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
index c0b839d218a95f..2d092974ac566f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -300,3 +300,311 @@ body: |
   bb.2:
     S_ENDPGM 0
 ...
+
+---
+name: skip_wait_event
+body: |
+  ; CHECK-LABEL: name: skip_wait_event
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_WAIT_EVENT 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_WAIT_EVENT 0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_signal_imm
+body: |
+  ; CHECK-LABEL: name: skip_barrier_signal_imm
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_SIGNAL_IMM -1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_SIGNAL_IMM -1
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_signal_isfirst_imm
+body: |
+  ; CHECK-LABEL: name: skip_barrier_signal_isfirst_imm
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_signal_m0
+body: |
+  ; CHECK-LABEL: name: skip_barrier_signal_m0
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_BARRIER_SIGNAL_M0 implicit $m0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $m0 = S_MOV_B32 -1
+    S_BARRIER_SIGNAL_M0 implicit $m0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_signal_isfirst_m0
+body: |
+  ; CHECK-LABEL: name: skip_barrier_signal_isfirst_m0
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $m0 = S_MOV_B32 -1
+    S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_wait
+body: |
+  ; CHECK-LABEL: name: skip_barrier_wait
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_WAIT -1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_WAIT -1
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_init_imm
+body: |
+  ; CHECK-LABEL: name: skip_barrier_init_imm
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_BARRIER_INIT_IMM -1, implicit $m0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $m0 = S_MOV_B32 -1
+    S_BARRIER_INIT_IMM -1, implicit $m0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_init_m0
+body: |
+  ; CHECK-LABEL: name: skip_barrier_init_m0
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_BARRIER_INIT_M0 implicit $m0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $m0 = S_MOV_B32 -1
+    S_BARRIER_INIT_M0 implicit $m0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_join_imm
+body: |
+  ; CHECK-LABEL: name: skip_barrier_join_imm
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_JOIN_IMM -1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_JOIN_IMM -1
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_leave
+body: |
+  ; CHECK-LABEL: name: skip_barrier_leave
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_LEAVE implicit-def $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_LEAVE implicit-def $scc
+
+  bb.2:
+    S_ENDPGM 0
+...



More information about the llvm-commits mailing list