[llvm] [AMDGPU] Enhance s_waitcnt insertion before barrier for gfx12 (PR #90595)

David Stuttard via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 30 04:21:34 PDT 2024


https://github.com/dstutt updated https://github.com/llvm/llvm-project/pull/90595

>From db63bf34748f1ed67b96c9659d04608ef0e0a42b Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Tue, 30 Apr 2024 12:14:38 +0100
Subject: [PATCH 1/2] [AMDGPU] Enhance s_waitcnt insertion before barrier for
 gfx12

Code to determine if a waitcnt is required before a barrier instruction only
considered S_BARRIER.
gfx12 adds barrier_signal/wait so need to enhance the existing code to look for
a barrier start (which is just an S_BARRIER for earlier architectures).
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  9 ++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll   |  2 ++
 .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll      | 22 +++++++++++++++++++
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 15a1db51c6d78b..072c1ab0c79d0f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1857,7 +1857,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // not, we need to ensure the subtarget is capable of backing off barrier
   // instructions in case there are any outstanding memory operations that may
   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
-  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
+  if (TII->isBarrierStart(MI.getOpcode()) &&
       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b314b9b2fb5135..56ea18bcfa3706 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1386,6 +1386,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   // This is used if an operand is a 32 bit register but needs to be aligned
   // regardless.
   void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
+
+  bool isBarrierStart(uint16_t Opcode) const {
+    return
+      Opcode == AMDGPU::S_BARRIER ||
+      Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
+      Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
+      Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
+      Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+  }
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index a7d3115af29bff..47c021769aa56f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -96,6 +96,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT4-NEXT:    s_wait_kmcnt 0x0
 ; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s2
 ; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT:    s_wait_storecnt 0x0
 ; VARIANT4-NEXT:    s_barrier_signal -1
 ; VARIANT4-NEXT:    s_barrier_wait -1
 ; VARIANT4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -142,6 +143,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s2, v0
 ; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT:    s_wait_storecnt 0x0
 ; VARIANT6-NEXT:    s_barrier_signal -1
 ; VARIANT6-NEXT:    s_barrier_wait -1
 ; VARIANT6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 4ab5e97964a857..38a34ec6daf73c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -12,6 +12,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal -1
 ; GCN-NEXT:    s_barrier_wait -1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -28,6 +29,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -56,6 +58,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal 1
 ; GCN-NEXT:    s_barrier_wait 1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -72,6 +75,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal 1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -100,6 +104,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal 0
 ; GCN-NEXT:    s_barrier_wait 0
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -116,6 +121,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal 0
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -146,6 +152,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal m0
 ; GCN-NEXT:    s_barrier_wait 1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -163,6 +170,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -192,6 +200,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal m0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -203,6 +212,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
 ; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
@@ -216,6 +226,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst -1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -235,6 +246,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst -1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -270,6 +282,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst 1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -289,6 +302,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst 1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -324,6 +338,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst 1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -343,6 +358,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst 1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -379,6 +395,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GCN-NEXT:    s_mov_b32 m0, 1
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst m0
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -399,6 +416,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, 1
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst m0
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -444,6 +462,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 ; GCN-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v9
 ; GCN-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
 ; GCN-NEXT:    global_store_b32 v[7:8], v10, off
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst m0
 ; GCN-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GCN-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
@@ -470,6 +489,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 ; GLOBAL-ISEL-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v9, 0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v[7:8], v9, off
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst m0
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s0, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1339,6 +1359,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal -1
 ; GCN-NEXT:    s_barrier_wait -1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -1355,6 +1376,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]

>From 93fe94ef51ea8fc3a4a693cb8566819b8ee584e3 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Tue, 30 Apr 2024 12:20:08 +0100
Subject: [PATCH 2/2] Formatting update

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 56ea18bcfa3706..ed421644b00c27 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1388,12 +1388,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
 
   bool isBarrierStart(uint16_t Opcode) const {
-    return
-      Opcode == AMDGPU::S_BARRIER ||
-      Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
-      Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
-      Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
-      Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+    return Opcode == AMDGPU::S_BARRIER ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
   }
 };
 



More information about the llvm-commits mailing list