[llvm] d6edc1a - [AMDGPU] Reenable BackOffBarrier on GFX11/12 (#155370)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 2 00:37:47 PDT 2025


Author: Pierre van Houtryve
Date: 2025-09-02T09:37:43+02:00
New Revision: d6edc1a96f6ba06c745bbc733f9e7ce0b44ab71c

URL: https://github.com/llvm/llvm-project/commit/d6edc1a96f6ba06c745bbc733f9e7ce0b44ab71c
DIFF: https://github.com/llvm/llvm-project/commit/d6edc1a96f6ba06c745bbc733f9e7ce0b44ab71c.diff

LOG: [AMDGPU] Reenable BackOffBarrier on GFX11/12 (#155370)

Re-enable it by adding a wait on vm_vsrc before every barrier "start"
instruction in GFX10/11/12 CU mode.

This is a less strong wait than what we do without BackOffBarrier, thus
this shouldn't introduce
any new guarantees that can be abused, instead it relaxes the guarantees
we have now to the bare
minimum needed to support the behavior users want (fence release +
barrier works).

There is an exact memory model in the works which will be documented
separately.

Added: 
    llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.td
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
    llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
    llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
    llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index edd3ce72d7df3..1038797374de3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1902,6 +1902,7 @@ def FeatureISAVersion10_3_Generic: FeatureSet<
 
 def FeatureISAVersion11_Common : FeatureSet<
   [FeatureGFX11,
+   FeatureBackOffBarrier,
    FeatureLDSBankCount32,
    FeatureDLInsts,
    FeatureDot5Insts,
@@ -1985,6 +1986,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
 
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
+   FeatureBackOffBarrier,
    FeatureAddressableLocalMemorySize65536,
    FeatureLDSBankCount32,
    FeatureDLInsts,

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fdbd9ce4a66bf..f7c7bb509c9ef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -983,13 +983,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
   }
 
-  bool isBarrier(unsigned Opcode) const {
+  // Check to see if opcode is for a barrier start. Pre gfx12 this is just the
+  // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
+  // to check for the barrier start (S_BARRIER_SIGNAL*)
+  bool isBarrierStart(unsigned Opcode) const {
     return Opcode == AMDGPU::S_BARRIER ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
-           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
-           Opcode == AMDGPU::S_BARRIER_WAIT ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+  }
+
+  bool isBarrier(unsigned Opcode) const {
+    return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
            Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
            Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
            Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||

diff  --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53f554eccb1fb..3006eaf468bdd 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -359,6 +359,12 @@ class SICacheControl {
                              bool IsCrossAddrSpaceOrdering,
                              Position Pos) const = 0;
 
+  /// Inserts any necessary instructions before the barrier start instruction
+  /// \p MI in order to support pairing of barriers and fences.
+  virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
+    return false;
+  };
+
   /// Virtual destructor to allow derivations to be deleted.
   virtual ~SICacheControl() = default;
 };
@@ -547,6 +553,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
                      SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace,
                      Position Pos) const override;
+
+  bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
 };
 
 class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -2169,6 +2177,21 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
+bool SIGfx10CacheControl::insertBarrierStart(
+    MachineBasicBlock::iterator &MI) const {
+  // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
+  // mode. This is because a CU mode release fence does not emit any wait, which
+  // is fine when only dealing with vmem, but isn't sufficient in the presence
+  // of barriers which do not go through vmem.
+  if (!ST.isCuModeEnabled())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+  return true;
+}
+
 bool SIGfx11CacheControl::enableLoadCacheBypass(
     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
     SIAtomicAddrSpace AddrSpace) const {
@@ -2840,7 +2863,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
   bool Changed = false;
 
   SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
-  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  CC = SICacheControl::create(ST);
 
   for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
@@ -2860,6 +2884,11 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         MI = II->getIterator();
       }
 
+      if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
+        Changed |= CC->insertBarrierStart(MI);
+        continue;
+      }
+
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;
 

diff  --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index b584f6df647ce..122e683f79664 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -54,9 +54,8 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
 ; GFX11-BACKOFF:       ; %bb.0:
 ; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-BACKOFF-NEXT:    s_barrier
+; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    flat_store_b32 v[2:3], v0
 ; GFX11-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index d23509b5aa812..b91963f08681c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -150,6 +150,7 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ; GFX10CU-NEXT:    buffer_load_dword v0, s[8:11], 0 offen lds
 ; GFX10CU-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX10CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10CU-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10CU-NEXT:    s_barrier
 ; GFX10CU-NEXT:    ds_read_b32 v0, v0
 ; GFX10CU-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index af8f5103f6d5d..e8b8d05ab26ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -213,9 +213,9 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
 ; GFX11-NEXT:    ds_load_b32 v1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_barrier
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v0
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
new file mode 100644
index 0000000000000..e921f581c00a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @test_s_barrier() {
+; GFX10-WGP-LABEL: test_s_barrier:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_barrier
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: test_s_barrier:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT:    s_barrier
+; GFX10-CU-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: test_s_barrier:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_barrier
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: test_s_barrier:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT:    s_barrier
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: test_s_barrier:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_barrier_signal -1
+; GFX12-WGP-NEXT:    s_barrier_wait -1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: test_s_barrier:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_alu 0xffe3
+; GFX12-CU-NEXT:    s_barrier_signal -1
+; GFX12-CU-NEXT:    s_barrier_wait -1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: test_s_barrier:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_alu 0xffe3
+; GFX1250-NEXT:    s_barrier_signal -1
+; GFX1250-NEXT:    s_barrier_wait -1
+; GFX1250-NEXT:    s_endpgm
+entry:
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+
+define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
+; GFX10-WGP-LABEL: test_s_barrier_workgroup_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_barrier
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT:    s_barrier
+; GFX10-CU-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: test_s_barrier_workgroup_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    s_barrier
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT:    s_barrier
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: test_s_barrier_workgroup_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    s_barrier_signal -1
+; GFX12-WGP-NEXT:    s_barrier_wait -1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_alu 0xffe3
+; GFX12-CU-NEXT:    s_barrier_signal -1
+; GFX12-CU-NEXT:    s_barrier_wait -1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: test_s_barrier_workgroup_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_wait_alu 0xffe3
+; GFX1250-NEXT:    s_barrier_signal -1
+; GFX1250-NEXT:    s_barrier_wait -1
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("workgroup") release
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+
+define amdgpu_kernel void @test_s_barrier_agent_fence() {
+; GFX10-WGP-LABEL: test_s_barrier_agent_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_barrier
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: test_s_barrier_agent_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT:    s_barrier
+; GFX10-CU-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: test_s_barrier_agent_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    s_barrier
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: test_s_barrier_agent_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT:    s_barrier
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: test_s_barrier_agent_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    s_barrier_signal -1
+; GFX12-WGP-NEXT:    s_barrier_wait -1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: test_s_barrier_agent_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_alu 0xffe3
+; GFX12-CU-NEXT:    s_barrier_signal -1
+; GFX12-CU-NEXT:    s_barrier_wait -1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: test_s_barrier_agent_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_alu 0xffe3
+; GFX1250-NEXT:    s_barrier_signal -1
+; GFX1250-NEXT:    s_barrier_wait -1
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("agent") release
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
index 292091ad6e1e7..bb47392e7b7d7 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -62,7 +62,7 @@ body:             |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX11-NEXT: S_BARRIER
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
@@ -176,7 +176,7 @@ body:             |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX11-NEXT: S_BARRIER
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7


        


More information about the llvm-commits mailing list