[llvm] [AMDGPU] introduce S_WAITCNT_FENCE_soft emitted by memory legalizer (PR #150167)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 22 22:26:27 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

Author: Sameer Sahasrabuddhe (ssahasra)

<details>
<summary>Changes</summary>

The new instruction represents any wait counts resulting from a fence, which the
memory legalizer cannot determine on its own. After lowering a fence to the
appropriate cache operations and any necessary S_WAIT* instructions, the
legalizer hands over further work to SIInsertWaitcnts using this instruction as
a placeholder. The waitcnt inserter can then use the additional information
carried by this instruction to emit more precise wait counts.

Currently this is used to implement efficient waitcnts for direct loads to LDS,
based on the ordering, scope and addrspace specified on the soft fence.

---

Patch is 57.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150167.diff


8 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+33) 
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+34) 
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+39-32) 
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+36) 
- (added) llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir (+153) 
- (added) llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll (+543) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir (+12) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 3902d4c3b1027..9d30951cac1a3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
 #define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
 
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/MC/MCInstrDesc.h"
 
 namespace llvm {
@@ -419,6 +420,38 @@ enum CPol {
 
 } // namespace CPol
 
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+  NONE,
+  SINGLETHREAD,
+  WAVEFRONT,
+  WORKGROUP,
+  AGENT,
+  SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operation. Can be ORed together.
+enum class SIAtomicAddrSpace {
+  NONE = 0u,
+  GLOBAL = 1u << 0,
+  LDS = 1u << 1,
+  SCRATCH = 1u << 2,
+  GDS = 1u << 3,
+  OTHER = 1u << 4,
+
+  /// The address spaces that can be accessed by a FLAT instruction.
+  FLAT = GLOBAL | LDS | SCRATCH,
+
+  /// The address spaces that support atomic instructions.
+  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+  /// All address spaces.
+  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
 namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
 
 enum Id { // Message ID, width(4) [3:0].
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9faf4974e3fd6..ca1c534896b5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -37,7 +37,9 @@
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/Support/DebugCounter.h"
 #include "llvm/TargetParser/TargetParser.h"
+
 using namespace llvm;
+using namespace AMDGPU;
 
 #define DEBUG_TYPE "si-insert-waitcnts"
 
@@ -1381,6 +1383,32 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
         Modified = true;
       } else
         WaitcntInstr = &II;
+    } else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
+      // Each direct load to LDS is also a store to LDS, but we do not have a
+      // separate counter for it. Instead these operations increment LOAD_CNT
+      // and need to be waited for at a release fence. So we treat a release
+      // fence as if it depends on any previous LDS DMA stores.
+      unsigned Ordering =
+          TII->getNamedOperand(II, AMDGPU::OpName::Ordering)->getImm();
+      unsigned Scope =
+          TII->getNamedOperand(II, AMDGPU::OpName::Scope)->getImm();
+      unsigned AddrSpace =
+          TII->getNamedOperand(II, AMDGPU::OpName::AddrSpace)->getImm();
+      if (isReleaseOrStronger((AtomicOrdering)Ordering) &&
+          Scope >= (unsigned)AMDGPU::SIAtomicScope::WORKGROUP &&
+          any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+        LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_FENCE_soft: " << II
+                          << "Before: " << Wait.LoadCnt << '\n';);
+        ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+        LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+      }
+      // It is possible (but unlikely) that this is the only wait instruction,
+      // in which case, we exit this loop without a WaitcntInstr to consume
+      // `Wait`. But that works because `Wait` was passed in by reference, and
+      // the callee eventually calls createNewWaitcnt on it. We test this
+      // possibility in an artificial MIR test since such a situation cannot be
+      // recreated by running the memory legalizer.
+      II.eraseFromParent();
     } else {
       assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
       assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1580,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
         ScoreBrackets.simplifyWaitcnt(OldWait);
       Wait = Wait.combined(OldWait);
       UpdatableInstr = &CombinedStoreDsCntInstr;
+    } else if (Opcode == AMDGPU::S_WAITCNT_FENCE_soft) {
+      // Architectures higher than GFX10 do not have direct loads to
+      // LDS, so no work required here yet.
+      II.eraseFromParent();
+      continue;
     } else {
       std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
       assert(CT.has_value());
@@ -2444,6 +2477,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
           Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
          Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
          Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+         Opcode == AMDGPU::S_WAITCNT_FENCE_soft ||
          counterTypeForInstr(Opcode).has_value();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3212060f303a5..6e998098a1ab2 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -57,38 +57,6 @@ enum class Position {
   AFTER
 };
 
-/// The atomic synchronization scopes supported by the AMDGPU target.
-enum class SIAtomicScope {
-  NONE,
-  SINGLETHREAD,
-  WAVEFRONT,
-  WORKGROUP,
-  AGENT,
-  SYSTEM
-};
-
-/// The distinct address spaces supported by the AMDGPU target for
-/// atomic memory operation. Can be ORed together.
-enum class SIAtomicAddrSpace {
-  NONE = 0u,
-  GLOBAL = 1u << 0,
-  LDS = 1u << 1,
-  SCRATCH = 1u << 2,
-  GDS = 1u << 3,
-  OTHER = 1u << 4,
-
-  /// The address spaces that can be accessed by a FLAT instruction.
-  FLAT = GLOBAL | LDS | SCRATCH,
-
-  /// The address spaces that support atomic instructions.
-  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
-
-  /// All address spaces.
-  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
-
-  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
-};
-
 class SIMemOpInfo final {
 private:
 
@@ -1160,6 +1128,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     Changed = true;
   }
 
+  // Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
+  // later add additional waits. To minimize clutter, we do this only when
+  // required. For now this just means a release operation at workgroup scope
+  // that synchronizes LDS, required by direct loads to LDS.
+  if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
+      any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
+        .addImm((unsigned)Order)
+        .addImm((unsigned)Scope)
+        .addImm((unsigned)AddrSpace);
+    Changed = true;
+  }
+
   if (Pos == Position::AFTER)
     --MI;
 
@@ -2068,6 +2049,19 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     Changed = true;
   }
 
+  // Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
+  // later add additional waits. To minimize clutter, we do this only when
+  // required. For now this just means a release operation at workgroup scope
+  // that synchronizes LDS, required by direct loads to LDS.
+  if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
+      any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
+        .addImm((unsigned)Order)
+        .addImm((unsigned)Scope)
+        .addImm((unsigned)AddrSpace);
+    Changed = true;
+  }
+
   if (VSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
@@ -2385,6 +2379,19 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     Changed = true;
   }
 
+  // Emit a soft wait count as a place holder for SIInsertWaitcnts, which will
+  // later add additional waits. To minimize clutter, we do this only when
+  // required. For now this just means a release operation at workgroup scope
+  // that synchronizes LDS, required by direct loads to LDS.
+  if (isReleaseOrStronger(Order) && Scope == SIAtomicScope::WORKGROUP &&
+      any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_FENCE_soft))
+        .addImm((unsigned)Order)
+        .addImm((unsigned)Scope)
+        .addImm((unsigned)AddrSpace);
+    Changed = true;
+  }
+
   if (Pos == Position::AFTER)
     --MI;
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc2f00e6..df1a7bd4424bc 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,12 @@ let OtherPredicates = [HasImageInsts] in {
   def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
 }
 
+def S_WAITCNT_FENCE_soft : SPseudoInstSI <
+   (outs), (ins i32imm:$Ordering, i32imm:$Scope, i32imm:$AddrSpace)> {
+   let hasSideEffects = 0;
+   let UseNamedOperandTable = 1;
+}
+
 def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
     [(int_amdgcn_s_sethalt timm:$simm16)]>;
 def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 66037615f0ba0..1f01c64de546c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -536,30 +536,36 @@ entry:
 define amdgpu_kernel void @workgroup_one_as_release() #0 {
   ; GFX6-LABEL: name: workgroup_one_as_release
   ; GFX6: bb.0.entry:
+  ; GFX6-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_one_as_release
   ; GFX8: bb.0.entry:
+  ; GFX8-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_release
   ; GFX10WGP: bb.0.entry:
   ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_release
   ; GFX10CU: bb.0.entry:
+  ; GFX10CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_release
   ; GFX11WGP: bb.0.entry:
   ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_release
   ; GFX11CU: bb.0.entry:
+  ; GFX11CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup-one-as") release
@@ -569,32 +575,38 @@ entry:
 define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
   ; GFX6-LABEL: name: workgroup_one_as_acq_rel
   ; GFX6: bb.0.entry:
+  ; GFX6-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_one_as_acq_rel
   ; GFX8: bb.0.entry:
+  ; GFX8-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
   ; GFX10WGP: bb.0.entry:
   ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
   ; GFX10CU: bb.0.entry:
+  ; GFX10CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
   ; GFX11WGP: bb.0.entry:
   ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel
   ; GFX11CU: bb.0.entry:
+  ; GFX11CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup-one-as") acq_rel
@@ -604,32 +616,38 @@ entry:
 define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
   ; GFX6-LABEL: name: workgroup_one_as_seq_cst
   ; GFX6: bb.0.entry:
+  ; GFX6-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_one_as_seq_cst
   ; GFX8: bb.0.entry:
+  ; GFX8-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
   ; GFX10WGP: bb.0.entry:
   ; GFX10WGP-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
   ; GFX10CU: bb.0.entry:
+  ; GFX10CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
   ; GFX11WGP: bb.0.entry:
   ; GFX11WGP-NEXT:   S_WAITCNT_soft 1015
+  ; GFX11WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst
   ; GFX11CU: bb.0.entry:
+  ; GFX11CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup-one-as") seq_cst
@@ -1283,33 +1301,39 @@ define amdgpu_kernel void @workgroup_release() #0 {
   ; GFX6-LABEL: name: workgroup_release
   ; GFX6: bb.0.entry:
   ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_release
   ; GFX8: bb.0.entry:
   ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_release
   ; GFX10WGP: bb.0.entry:
   ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX10CU-LABEL: name: workgroup_release
   ; GFX10CU: bb.0.entry:
   ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_release
   ; GFX11WGP: bb.0.entry:
   ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   S_ENDPGM 0
   ;
   ; GFX11CU-LABEL: name: workgroup_release
   ; GFX11CU: bb.0.entry:
   ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") release
@@ -1320,16 +1344,19 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ; GFX6-LABEL: name: workgroup_acq_rel
   ; GFX6: bb.0.entry:
   ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_acq_rel
   ; GFX8: bb.0.entry:
   ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_acq_rel
   ; GFX10WGP: bb.0.entry:
   ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10WGP-NEXT:   S_ENDPGM 0
@@ -1337,11 +1364,13 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ; GFX10CU-LABEL: name: workgroup_acq_rel
   ; GFX10CU: bb.0.entry:
   ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_acq_rel
   ; GFX11WGP: bb.0.entry:
   ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
@@ -1349,6 +1378,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
   ; GFX11CU-LABEL: name: workgroup_acq_rel
   ; GFX11CU: bb.0.entry:
   ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") acq_rel
@@ -1359,16 +1389,19 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ; GFX6-LABEL: name: workgroup_seq_cst
   ; GFX6: bb.0.entry:
   ; GFX6-NEXT:   S_WAITCNT_soft 127
+  ; GFX6-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX6-NEXT:   S_ENDPGM 0
   ;
   ; GFX8-LABEL: name: workgroup_seq_cst
   ; GFX8: bb.0.entry:
   ; GFX8-NEXT:   S_WAITCNT_soft 127
+  ; GFX8-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX8-NEXT:   S_ENDPGM 0
   ;
   ; GFX10WGP-LABEL: name: workgroup_seq_cst
   ; GFX10WGP: bb.0.entry:
   ; GFX10WGP-NEXT:   S_WAITCNT_soft 112
+  ; GFX10WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX10WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX10WGP-NEXT:   S_ENDPGM 0
@@ -1376,11 +1409,13 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ; GFX10CU-LABEL: name: workgroup_seq_cst
   ; GFX10CU: bb.0.entry:
   ; GFX10CU-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX10CU-NEXT:   S_ENDPGM 0
   ;
   ; GFX11WGP-LABEL: name: workgroup_seq_cst
   ; GFX11WGP: bb.0.entry:
   ; GFX11WGP-NEXT:   S_WAITCNT_soft 7
+  ; GFX11WGP-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11WGP-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
   ; GFX11WGP-NEXT:   BUFFER_GL0_INV implicit $exec
   ; GFX11WGP-NEXT:   S_ENDPGM 0
@@ -1388,6 +1423,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
   ; GFX11CU-LABEL: name: workgroup_seq_cst
   ; GFX11CU: bb.0.entry:
   ; GFX11CU-NEXT:   S_WAITCNT_soft 64519
+  ; GFX11CU-NEXT:   S_WAITCNT_FENCE_soft 5, 3, 15
   ; GFX11CU-NEXT:   S_ENDPGM 0
 entry:
   fence syncscope("workgroup") seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
new file mode 100644
index 0000000000000..b5f5a359ee49b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
@@ -0,0 +1,153 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
+
+
+# Expected vmcnt(0) since the direct load is the only load.
+---
+name: dma_then_fence
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: dma_then_fence
+    ; GCN: S_WAITCNT 0
+    ; GCN-NEXT: $m0 = S_MOV_B32 0
+    ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+    ; GCN-NEXT: S_WAITCNT 3952
+    ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    $m0 = S_MOV_B32 0
+    BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+    S_WAITCNT_FENCE_soft 5, 3, 15
+    $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
+
+---
+name: dma_then_global_load
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: dma_then_global_load
+    ; GCN: S_WAITCNT 0
+    ; GCN-NEXT: $m0 = S_MOV_B32 0
+    ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAITCNT 3953
+    ; GCN-NEXT: $vgpr1 = V_AD...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/150167


More information about the llvm-commits mailing list