[llvm] [clang] [clang-tools-extra] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 12 09:20:32 PST 2024


https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77438

From 2cd8fc68e14cb5ae1a7f576ac84b4e3a5cc6111e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 9 Jan 2024 10:24:38 +0000
Subject: [PATCH] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions

Update SIMemoryLegalizer and SIInsertWaitcnts to use separate wait
instructions per counter (e.g. S_WAIT_LOADCNT) and split VMCNT into
separate LOADCNT, SAMPLECNT and BVHCNT counters.
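
For illustration, the pre-gfx12 counters map onto the gfx12 counters roughly
as sketched below. This is a standalone editorial sketch of the mapping only;
the enum and helper names are invented and are not part of the patch.

  #include <cstdio>

  // Sketch: how each legacy counter fans out on gfx12.
  enum class LegacyCounter { VMcnt, LGKMcnt, VScnt };

  static const char *gfx12CountersFor(LegacyCounter C) {
    switch (C) {
    case LegacyCounter::VMcnt:
      return "LOADcnt / SAMPLEcnt / BVHcnt"; // split by kind of VMEM read
    case LegacyCounter::LGKMcnt:
      return "DScnt / KMcnt"; // LDS/GDS traffic vs. SMEM and messages
    case LegacyCounter::VScnt:
      return "STOREcnt";
    }
    return "";
  }

  int main() {
    std::printf("VMcnt maps to %s on gfx12\n",
                gfx12CountersFor(LegacyCounter::VMcnt));
  }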
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |    3 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |    4 +
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 1020 +++++++++++++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |    3 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   37 +-
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |  180 +++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |    7 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  146 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  123 +-
 .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll    |    4 +-
 .../GlobalISel/clamp-fmed3-const-combine.ll   |   48 +-
 .../GlobalISel/clamp-minmax-const-combine.ll  |  102 +-
 .../GlobalISel/extractelement-stack-lower.ll  |   24 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll |  220 ++--
 .../GlobalISel/fmed3-min-max-const-combine.ll |  108 +-
 .../llvm.amdgcn.global.atomic.csub.ll         |   38 +-
 .../llvm.amdgcn.image.atomic.dim.a16.ll       |   84 +-
 .../llvm.amdgcn.image.atomic.dim.ll           |   84 +-
 .../llvm.amdgcn.image.gather4.a16.dim.ll      |   28 +-
 .../llvm.amdgcn.image.gather4.dim.ll          |   36 +-
 .../llvm.amdgcn.image.getresinfo.a16.ll       |   16 +-
 .../llvm.amdgcn.image.getresinfo.ll           |   22 +-
 .../llvm.amdgcn.image.load.1d.d16.ll          |   28 +-
 .../GlobalISel/llvm.amdgcn.image.load.1d.ll   |   30 +-
 .../GlobalISel/llvm.amdgcn.image.load.2d.ll   |    6 +-
 .../llvm.amdgcn.image.load.2darraymsaa.a16.ll |    6 +-
 .../llvm.amdgcn.image.load.2darraymsaa.ll     |    6 +-
 .../llvm.amdgcn.image.load.3d.a16.ll          |    6 +-
 .../GlobalISel/llvm.amdgcn.image.load.3d.ll   |    6 +-
 .../llvm.amdgcn.image.sample.g16.ll           |   22 +-
 .../GlobalISel/llvm.amdgcn.rsq.clamp.ll       |   48 +-
 .../AMDGPU/GlobalISel/load-constant.96.ll     |  140 ++-
 .../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll |   84 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |   66 +-
 llvm/test/CodeGen/AMDGPU/add.ll               |   42 +-
 .../AMDGPU/atomic_optimizations_buffer.ll     |   92 +-
 .../atomic_optimizations_global_pointer.ll    |  128 +--
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |   80 +-
 .../atomic_optimizations_struct_buffer.ll     |   96 +-
 llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll  |   45 +-
 .../AMDGPU/cgp-addressing-modes-smem.ll       |   12 +-
 llvm/test/CodeGen/AMDGPU/clamp.ll             |  184 +--
 .../fast-unaligned-load-store.global.ll       |   30 +-
 .../fast-unaligned-load-store.private.ll      |   84 +-
 .../CodeGen/AMDGPU/flat-scratch-i8-i16.ll     |   72 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll  |  252 ++--
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |  648 ++++++-----
 llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll  |  531 ++++-----
 llvm/test/CodeGen/AMDGPU/fmaximum.ll          |   20 +-
 llvm/test/CodeGen/AMDGPU/fminimum.ll          |   20 +-
 .../AMDGPU/fp-min-max-num-flat-atomics.ll     |    8 +-
 .../AMDGPU/fp-min-max-num-global-atomics.ll   |    4 +-
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll |  376 +++---
 .../test/CodeGen/AMDGPU/global-saddr-store.ll |   10 +-
 .../test/CodeGen/AMDGPU/global_atomics_i64.ll |  422 +++----
 ...vm.amdgcn.global.atomic.ordered.add.b64.ll |   12 +-
 .../AMDGPU/llvm.amdgcn.image.a16.dim.ll       |   54 +-
 .../AMDGPU/llvm.amdgcn.image.a16.encode.ll    |   54 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll   |  108 +-
 .../llvm.amdgcn.image.gather4.a16.dim.ll      |   28 +-
 .../AMDGPU/llvm.amdgcn.image.getlod.dim.ll    |   14 +-
 .../AMDGPU/llvm.amdgcn.image.msaa.load.ll     |   26 +-
 .../llvm.amdgcn.image.sample.a16.dim.ll       |   78 +-
 .../llvm.amdgcn.image.sample.d16.dim.ll       |   16 +-
 .../AMDGPU/llvm.amdgcn.image.sample.dim.ll    |  124 +-
 .../llvm.amdgcn.image.sample.g16.encode.ll    |   22 +-
 .../AMDGPU/llvm.amdgcn.image.sample.g16.ll    |   22 +-
 .../AMDGPU/llvm.amdgcn.lds.direct.load.ll     |   10 +-
 .../AMDGPU/llvm.amdgcn.lds.param.load.ll      |   14 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    |   76 +-
 .../AMDGPU/llvm.amdgcn.permlane16.var.ll      |  104 +-
 .../AMDGPU/llvm.amdgcn.raw.buffer.load.ll     |   86 +-
 .../llvm.amdgcn.raw.tbuffer.load.d16.ll       |    8 +-
 .../AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll    |   24 +-
 .../llvm.amdgcn.raw.tbuffer.store.d16.ll      |   10 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll   |   12 +-
 .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll      |  218 ++--
 .../AMDGPU/llvm.amdgcn.s.buffer.load.ll       |   72 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll |   14 +-
 .../llvm.amdgcn.struct.buffer.load.format.ll  |   52 +-
 ....amdgcn.struct.buffer.load.format.v3f16.ll |    2 +-
 .../llvm.amdgcn.struct.tbuffer.load.d16.ll    |    8 +-
 .../AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll |   30 +-
 .../llvm.amdgcn.struct.tbuffer.store.d16.ll   |   10 +-
 .../llvm.amdgcn.struct.tbuffer.store.ll       |    2 +-
 llvm/test/CodeGen/AMDGPU/load-constant-f32.ll |    4 +-
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |    8 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  |  176 +--
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |  158 +--
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  100 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |   24 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  |  208 ++--
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll |   24 +-
 .../memory-legalizer-private-nontemporal.ll   |   48 +-
 .../memory-legalizer-private-volatile.ll      |   56 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               |   72 +-
 llvm/test/CodeGen/AMDGPU/offset-split-flat.ll |  566 +++++----
 .../CodeGen/AMDGPU/offset-split-global.ll     |  566 +++++----
 llvm/test/CodeGen/AMDGPU/readcyclecounter.ll  |    4 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |   54 +-
 .../CodeGen/AMDGPU/waitcnt-global-inv-wb.mir  |    2 +-
 101 files changed, 5599 insertions(+), 3722 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a7d8ff0242b801..7ca7722a5cebd1 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1242,7 +1242,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
       case AMDGPU::S_WAITCNT: {
         const int64_t Imm = MI.getOperand(0).getImm();
         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
-        return (Decoded.LgkmCnt == 0);
+        // DsCnt corresponds to LGKMCnt here.
+        return (Decoded.DsCnt == 0);
       }
       default:
         // SOPP instructions cannot mitigate the hazard.
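
The hazard check above now reads Decoded.DsCnt because AMDGPU::Waitcnt is
reworked by this patch (in AMDGPUBaseInfo.h, not shown in this excerpt) to
carry one field per gfx12 counter, and decoding a pre-gfx12 S_WAITCNT
immediate places the legacy LGKM count in DsCnt. The sketch below only
mirrors the field set that the code in this diff reads and writes; it is not
the verbatim declaration and may differ from the real struct.

  // Field set inferred from uses such as Wait.LoadCnt / Wait.DsCnt in this
  // diff; ~0u is assumed to mean "no wait required" for a counter, matching
  // how the pass resets counters it has already handled.
  struct WaitcntSketch {
    unsigned LoadCnt = ~0u;   // VMcnt prior to gfx12
    unsigned ExpCnt = ~0u;
    unsigned DsCnt = ~0u;     // LGKMcnt prior to gfx12
    unsigned StoreCnt = ~0u;  // VScnt on gfx10/gfx11
    unsigned SampleCnt = ~0u; // gfx12+ only
    unsigned BvhCnt = ~0u;    // gfx12+ only
    unsigned KmCnt = ~0u;     // gfx12+ only
  };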
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f6f37f5170a403..f3d38c018d2709 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1175,6 +1175,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
 
+  /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+  /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+  bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
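
The code that consumes this query to pick a wait-count generator is outside
this excerpt. A plausible sketch, assuming SIInsertWaitcnts simply
re-initialises one of the two generator members declared further down and
points WCG at it (the exact form in the patch may differ):

  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
    WCG = &WCGPreGFX12;
  }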
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1cb1d32707f2d7..37890881d57544 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -57,7 +57,18 @@ namespace {
 // associated with the operand.  Used for determining whether
 // s_waitcnt instruction needs to be emitted.
 
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
+enum InstCounterType {
+  LOAD_CNT = 0, // VMcnt prior to gfx12.
+  DS_CNT,       // LGKMcnt prior to gfx12.
+  EXP_CNT,      //
+  STORE_CNT,    // VScnt in gfx10/gfx11.
+  NUM_NORMAL_INST_CNTS,
+  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+  BVH_CNT,                           // gfx12+ only.
+  KM_CNT,                            // gfx12+ only.
+  NUM_EXTENDED_INST_CNTS,
+  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+};
 } // namespace
 
 namespace llvm {
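
Keeping the three gfx12-only counters at the end of the enum lets pre-gfx12
code iterate over just a prefix of the counters. A small usage sketch of
inst_counter_types() (defined in the next hunk), assuming MaxCounter is
NUM_NORMAL_INST_CNTS on pre-gfx12 targets and NUM_EXTENDED_INST_CNTS on
gfx12+:

  // Pre-gfx12: visits LOAD_CNT, DS_CNT, EXP_CNT and STORE_CNT only.
  for (auto T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
    (void)T; // per-counter work goes here
  }
  // gfx12+: additionally visits SAMPLE_CNT, BVH_CNT and KM_CNT.
  for (auto T : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    (void)T; // per-counter work goes here
  }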
@@ -67,15 +78,23 @@ template <> struct enum_iteration_traits<InstCounterType> {
 } // namespace llvm
 
 namespace {
-auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
+// Return an iterator over all counters between LOAD_CNT (the first counter)
+// and \c MaxCounter (exclusive, default value yields an enumeration over
+// all counters).
+auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
+  return enum_seq(LOAD_CNT, MaxCounter);
+}
 
 using RegInterval = std::pair<int, int>;
 
 struct HardwareLimits {
-  unsigned VmcntMax;
+  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
   unsigned ExpcntMax;
-  unsigned LgkmcntMax;
-  unsigned VscntMax;
+  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
+  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
+  unsigned SamplecntMax; // gfx12+ only.
+  unsigned BvhcntMax;    // gfx12+ only.
+  unsigned KmcntMax;     // gfx12+ only.
 };
 
 struct RegisterEncoding {
@@ -86,31 +105,25 @@ struct RegisterEncoding {
 };
 
 enum WaitEventType {
-  VMEM_ACCESS,          // vector-memory read & write
-  VMEM_READ_ACCESS,     // vector-memory read
-  VMEM_WRITE_ACCESS,    // vector-memory write that is not scratch
-  SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
-  LDS_ACCESS,           // lds read & write
-  GDS_ACCESS,           // gds read & write
-  SQ_MESSAGE,           // send message
-  SMEM_ACCESS,          // scalar-memory read & write
-  EXP_GPR_LOCK,         // export holding on its data src
-  GDS_GPR_LOCK,         // GDS holding on its data and addr src
-  EXP_POS_ACCESS,       // write to export position
-  EXP_PARAM_ACCESS,     // write to export parameter
-  VMW_GPR_LOCK,         // vector-memory write holding on its data src
-  EXP_LDS_ACCESS,       // read by ldsdir counting as export
+  VMEM_ACCESS,              // vector-memory read & write
+  VMEM_READ_ACCESS,         // vector-memory read
+  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
+  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
+  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
+  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
+  LDS_ACCESS,               // lds read & write
+  GDS_ACCESS,               // gds read & write
+  SQ_MESSAGE,               // send message
+  SMEM_ACCESS,              // scalar-memory read & write
+  EXP_GPR_LOCK,             // export holding on its data src
+  GDS_GPR_LOCK,             // GDS holding on its data and addr src
+  EXP_POS_ACCESS,           // write to export position
+  EXP_PARAM_ACCESS,         // write to export parameter
+  VMW_GPR_LOCK,             // vector-memory write holding on its data src
+  EXP_LDS_ACCESS,           // read by ldsdir counting as export
   NUM_WAIT_EVENTS,
 };
 
-static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
-    (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
-    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
-        (1 << SQ_MESSAGE),
-    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
-    (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};
-
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
@@ -137,17 +150,33 @@ enum VmemType {
   // MIMG instructions with a sampler.
   VMEM_SAMPLER,
   // BVH instructions
-  VMEM_BVH
+  VMEM_BVH,
+  NUM_VMEM_TYPES
 };
 
+// Maps values of InstCounterType to the instruction that waits on that
+// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
+// returns true.
+static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
+    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
+    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+    AMDGPU::S_WAIT_KMCNT};
+
 static bool updateVMCntOnly(const MachineInstr &Inst) {
   return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
          SIInstrInfo::isFLATScratch(Inst);
 }
 
+#ifndef NDEBUG
+static bool isNormalMode(InstCounterType MaxCounter) {
+  return MaxCounter == NUM_NORMAL_INST_CNTS;
+}
+#endif // NDEBUG
+
 VmemType getVmemType(const MachineInstr &Inst) {
   assert(updateVMCntOnly(Inst));
-  if (!SIInstrInfo::isMIMG(Inst))
+  if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
+      !SIInstrInfo::isVSAMPLE(Inst))
     return VMEM_NOSAMPLER;
   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
   const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
@@ -156,25 +185,49 @@ VmemType getVmemType(const MachineInstr &Inst) {
                        : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
 }
 
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
   switch (T) {
-  case VM_CNT:
-    Wait.VmCnt = std::min(Wait.VmCnt, Count);
-    break;
+  case LOAD_CNT:
+    return Wait.LoadCnt;
   case EXP_CNT:
-    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
-    break;
-  case LGKM_CNT:
-    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
-    break;
-  case VS_CNT:
-    Wait.VsCnt = std::min(Wait.VsCnt, Count);
-    break;
+    return Wait.ExpCnt;
+  case DS_CNT:
+    return Wait.DsCnt;
+  case STORE_CNT:
+    return Wait.StoreCnt;
+  case SAMPLE_CNT:
+    return Wait.SampleCnt;
+  case BVH_CNT:
+    return Wait.BvhCnt;
+  case KM_CNT:
+    return Wait.KmCnt;
   default:
     llvm_unreachable("bad InstCounterType");
   }
 }
 
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+  unsigned &WC = getCounterRef(Wait, T);
+  WC = std::min(WC, Count);
+}
+
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+  getCounterRef(Wait, T) = ~0u;
+}
+
+unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+  return getCounterRef(Wait, T);
+}
+
+// Mapping from event to counter according to the given event mask table.
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+  for (auto T : inst_counter_types()) {
+    if (masks[T] & (1 << E))
+      return T;
+  }
+  llvm_unreachable("event type has no associated counter");
+}
+
 // This objects maintains the current score brackets of each wait counter, and
 // a per-register scoreboard for each wait counter.
 //
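
These helpers let the rest of the pass treat all counters uniformly:
addWait() combines by taking the minimum (the stricter wait wins),
setNoWait() resets a counter to ~0u, and eventCounter() looks an event up in
whichever per-target mask table the active generator supplies. A brief
illustration of the combining behaviour, assuming a default-constructed
AMDGPU::Waitcnt leaves every field at ~0u (values are made up):

  AMDGPU::Waitcnt Wait;        // every counter starts at "no wait" (~0u)
  addWait(Wait, LOAD_CNT, 3);  // Wait.LoadCnt == 3
  addWait(Wait, LOAD_CNT, 1);  // min(3, 1) -> Wait.LoadCnt == 1
  addWait(Wait, LOAD_CNT, 5);  // min(1, 5) -> still 1; stricter wait kept
  setNoWait(Wait, LOAD_CNT);   // back to ~0u, nothing left to wait for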
@@ -185,20 +238,30 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
 // "s_waitcnt 0" before use.
 class WaitcntBrackets {
 public:
-  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
-                  RegisterEncoding Encoding)
-      : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
+  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+                  HardwareLimits Limits, RegisterEncoding Encoding,
+                  const unsigned *WaitEventMaskForInst,
+                  InstCounterType SmemAccessCounter)
+      : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
+        Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
+        SmemAccessCounter(SmemAccessCounter) {}
 
   unsigned getWaitCountMax(InstCounterType T) const {
     switch (T) {
-    case VM_CNT:
-      return Limits.VmcntMax;
-    case LGKM_CNT:
-      return Limits.LgkmcntMax;
+    case LOAD_CNT:
+      return Limits.LoadcntMax;
+    case DS_CNT:
+      return Limits.DscntMax;
     case EXP_CNT:
       return Limits.ExpcntMax;
-    case VS_CNT:
-      return Limits.VscntMax;
+    case STORE_CNT:
+      return Limits.StorecntMax;
+    case SAMPLE_CNT:
+      return Limits.SamplecntMax;
+    case BVH_CNT:
+      return Limits.BvhcntMax;
+    case KM_CNT:
+      return Limits.KmcntMax;
     default:
       break;
     }
@@ -219,20 +282,11 @@ class WaitcntBrackets {
     return getScoreUB(T) - getScoreLB(T);
   }
 
-  // Mapping from event to counter.
-  InstCounterType eventCounter(WaitEventType E) const {
-    for (auto T : inst_counter_types()) {
-      if (WaitEventMaskForInst[T] & (1 << E))
-        return T;
-    }
-    llvm_unreachable("event type has no associated counter");
-  }
-
   unsigned getRegScore(int GprNo, InstCounterType T) const {
     if (GprNo < NUM_ALL_VGPRS) {
       return VgprScores[T][GprNo];
     }
-    assert(T == LGKM_CNT);
+    assert(T == SmemAccessCounter);
     return SgprScores[GprNo - NUM_ALL_VGPRS];
   }
 
@@ -269,15 +323,15 @@ class WaitcntBrackets {
   }
 
   bool hasPendingFlat() const {
-    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
-             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
-            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
-             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
   }
 
   void setPendingFlat() {
-    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
-    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
   }
 
   // Return true if there might be pending writes to the specified vgpr by VMEM
@@ -293,8 +347,8 @@ class WaitcntBrackets {
   }
 
   void setNonKernelFunctionInitialState() {
-    setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
-    PendingEvents |= WaitEventMaskForInst[VS_CNT];
+    setScoreUB(STORE_CNT, getWaitCountMax(STORE_CNT));
+    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
   }
 
   void print(raw_ostream &);
@@ -331,7 +385,7 @@ class WaitcntBrackets {
       VgprUB = std::max(VgprUB, GprNo);
       VgprScores[T][GprNo] = Val;
     } else {
-      assert(T == LGKM_CNT);
+      assert(T == SmemAccessCounter);
       SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
       SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
     }
@@ -342,8 +396,11 @@ class WaitcntBrackets {
                    unsigned OpNo, unsigned Val);
 
   const GCNSubtarget *ST = nullptr;
+  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
   HardwareLimits Limits = {};
   RegisterEncoding Encoding = {};
+  const unsigned *WaitEventMaskForInst;
+  InstCounterType SmemAccessCounter;
   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
   unsigned PendingEvents = 0;
@@ -354,20 +411,139 @@ class WaitcntBrackets {
   int VgprUB = -1;
   int SgprUB = -1;
   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12) or KM_CNT (gfx12+ only) is relevant.
   unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
 };
 
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and the instructions for waiting on
+// them underwent a major shift with gfx12, enough that this abstraction
+// keeps the main analysis logic considerably simpler than it would
+// otherwise have to be.
+class WaitcntGenerator {
+protected:
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  AMDGPU::IsaVersion IV;
+  InstCounterType MaxCounter;
+
+public:
+  WaitcntGenerator() {}
+  WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+      : ST(ST), TII(ST->getInstrInfo()),
+        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+  // Edits an existing sequence of wait count instructions according
+  // to an incoming Waitcnt value, which is itself updated to reflect
+  // any new wait count instructions which may need to be generated by
+  // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
+  // were made.
+  //
+  // This editing will usually just update operands, but it may also
+  // delete instructions if the incoming Wait value indicates they are not
+  // needed. It may also remove existing instructions for which a wait
+  // is needed if it can be determined that it is better to generate new
+  // instructions later, as can happen on gfx12.
+  virtual bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const = 0;
+
+  // Transform a soft waitcnt into a normal one.
+  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
+
+  // Generates new wait count instructions according to the value of
+  // Wait, returning true if any new instructions were created.
+  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+                                MachineBasicBlock::instr_iterator It,
+                                AMDGPU::Waitcnt Wait) = 0;
+
+  // Returns an array of bit masks which can be used to map values in
+  // WaitEventType to corresponding counter values in InstCounterType.
+  virtual const unsigned *getWaitEventMask() const = 0;
+
+  virtual ~WaitcntGenerator() = default;
+};
+
+class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
+public:
+  WaitcntGeneratorPreGFX12() {}
+  WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
+      : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+  bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const override;
+
+  bool createNewWaitcnt(MachineBasicBlock &Block,
+                        MachineBasicBlock::instr_iterator It,
+                        AMDGPU::Waitcnt Wait) override;
+
+  const unsigned *getWaitEventMask() const override {
+    assert(ST);
+
+    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
+            (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
+        (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+            (1 << SQ_MESSAGE),
+        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+            (1 << EXP_LDS_ACCESS),
+        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+        0,
+        0,
+        0};
+
+    return WaitEventMaskForInstPreGFX12;
+  }
+};
+
+class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
+public:
+  WaitcntGeneratorGFX12Plus() {}
+  WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
+      : WaitcntGenerator(ST, MaxCounter) {}
+
+  bool
+  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+                          MachineBasicBlock::instr_iterator It) const override;
+
+  bool createNewWaitcnt(MachineBasicBlock &Block,
+                        MachineBasicBlock::instr_iterator It,
+                        AMDGPU::Waitcnt Wait) override;
+
+  const unsigned *getWaitEventMask() const override {
+    assert(ST);
+
+    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+        (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
+        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+            (1 << EXP_LDS_ACCESS),
+        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+        (1 << VMEM_SAMPLER_READ_ACCESS),
+        (1 << VMEM_BVH_READ_ACCESS),
+        (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
+
+    return WaitEventMaskForInstGFX12Plus;
+  }
+};
+
 class SIInsertWaitcnts : public MachineFunctionPass {
 private:
   const GCNSubtarget *ST = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
-  AMDGPU::IsaVersion IV;
 
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
   DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
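
The most visible difference between the two mask tables added above is where
scalar memory traffic lands: with the pre-gfx12 generator, SMEM_ACCESS and
SQ_MESSAGE feed DS_CNT (the legacy LGKMcnt slot), whereas the gfx12+
generator routes them to the new KM_CNT and gives sampler and BVH reads their
own counters. A hypothetical lookup through eventCounter() shows the
difference (illustrative usage, not code from the patch):

  const unsigned *Masks = WCG->getWaitEventMask();
  InstCounterType SmemCounter = eventCounter(Masks, SMEM_ACCESS);
  // Pre-gfx12 generator: SmemCounter == DS_CNT (i.e. LGKMcnt)
  // GFX12+ generator:    SmemCounter == KM_CNT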
@@ -379,6 +555,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
     bool Dirty = true;
   };
 
+  InstCounterType SmemAccessCounter;
+
   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
 
   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
@@ -388,10 +566,20 @@ class SIInsertWaitcnts : public MachineFunctionPass {
 
   bool OptNone;
 
+  // In any given run of this pass, WCG will point to one of these two
+  // generator objects, which must be re-initialised before use with a
+  // value constructed via the subtarget-taking constructor.
+  WaitcntGeneratorPreGFX12 WCGPreGFX12;
+  WaitcntGeneratorGFX12Plus WCGGFX12Plus;
+
+  WaitcntGenerator *WCG = nullptr;
+
   // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
   // message.
   DenseSet<MachineInstr *> ReleaseVGPRInsts;
 
+  InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+
 public:
   static char ID;
 
@@ -438,16 +626,22 @@ class SIInsertWaitcnts : public MachineFunctionPass {
 
     if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
         DebugCounter::shouldExecute(ForceLgkmCounter)) {
-      ForceEmitWaitcnt[LGKM_CNT] = true;
+      ForceEmitWaitcnt[DS_CNT] = true;
+      ForceEmitWaitcnt[KM_CNT] = true;
     } else {
-      ForceEmitWaitcnt[LGKM_CNT] = false;
+      ForceEmitWaitcnt[DS_CNT] = false;
+      ForceEmitWaitcnt[KM_CNT] = false;
     }
 
     if (DebugCounter::isCounterSet(ForceVMCounter) &&
         DebugCounter::shouldExecute(ForceVMCounter)) {
-      ForceEmitWaitcnt[VM_CNT] = true;
+      ForceEmitWaitcnt[LOAD_CNT] = true;
+      ForceEmitWaitcnt[SAMPLE_CNT] = true;
+      ForceEmitWaitcnt[BVH_CNT] = true;
     } else {
-      ForceEmitWaitcnt[VM_CNT] = false;
+      ForceEmitWaitcnt[LOAD_CNT] = false;
+      ForceEmitWaitcnt[SAMPLE_CNT] = false;
+      ForceEmitWaitcnt[BVH_CNT] = false;
     }
 #endif // NDEBUG
   }
@@ -455,6 +649,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
   // FLAT instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
+    // Maps VMEM access types to their corresponding WaitEventType.
+    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
+        VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
+
     assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
     // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
     // these should use VM_CNT.
@@ -467,7 +665,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
         return SCRATCH_WRITE_ACCESS;
       return VMEM_WRITE_ACCESS;
     }
-    return VMEM_READ_ACCESS;
+    if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+      return VMEM_READ_ACCESS;
+    return VmemReadMapping[getVmemType(Inst)];
   }
 
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
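
On gfx12 the event recorded for a VMEM read depends on the kind of access,
so sampler and BVH loads are tracked against their own counters, while FLAT
accesses and all pre-gfx12 reads keep using the plain VMEM_READ_ACCESS event.
A condensed sketch of the decision getVmemWaitEventType() makes for reads
(the helper name below is invented, and the LDS-DMA and store cases shown
above are omitted):

  WaitEventType classifyVmemRead(const MachineInstr &Inst,
                                 const GCNSubtarget &ST) {
    // FLAT reads and all pre-gfx12 reads count against LOAD_CNT / VMcnt.
    if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
      return VMEM_READ_ACCESS;
    switch (getVmemType(Inst)) {          // NOSAMPLER / SAMPLER / BVH
    case VMEM_SAMPLER: return VMEM_SAMPLER_READ_ACCESS; // -> SAMPLE_CNT
    case VMEM_BVH:     return VMEM_BVH_READ_ACCESS;     // -> BVH_CNT
    default:           return VMEM_READ_ACCESS;         // -> LOAD_CNT
    }
  }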
@@ -488,13 +688,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
                                WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
-  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
-                               MachineInstr &OldWaitcntInstr,
-                               AMDGPU::Waitcnt &Wait,
-                               MachineBasicBlock::instr_iterator It) const;
-
-  // Transform a soft waitcnt into a normal one.
-  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
 };
 
 } // end anonymous namespace
@@ -556,8 +749,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
                                     WaitEventType E, MachineInstr &Inst) {
-  InstCounterType T = eventCounter(E);
-  unsigned CurrScore = getScoreUB(T) + 1;
+  InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+
+  unsigned UB = getScoreUB(T);
+  unsigned CurrScore = UB + 1;
   if (CurrScore == 0)
     report_fatal_error("InsertWaitcnt score wraparound");
   // PendingEvents and ScoreUB need to be update regardless if this event
@@ -686,7 +881,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       if (!Op.isReg() || !Op.isDef())
         continue;
       RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
-      if (T == VM_CNT) {
+      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
         if (Interval.first >= NUM_ALL_VGPRS)
           continue;
         if (updateVMCntOnly(Inst)) {
@@ -714,21 +909,33 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 
 void WaitcntBrackets::print(raw_ostream &OS) {
   OS << '\n';
-  for (auto T : inst_counter_types()) {
+  for (auto T : inst_counter_types(MaxCounter)) {
     unsigned SR = getScoreRange(T);
 
     switch (T) {
-    case VM_CNT:
-      OS << "    VM_CNT(" << SR << "): ";
+    case LOAD_CNT:
+      OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
+         << SR << "): ";
       break;
-    case LGKM_CNT:
-      OS << "    LGKM_CNT(" << SR << "): ";
+    case DS_CNT:
+      OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
+         << SR << "): ";
       break;
     case EXP_CNT:
       OS << "    EXP_CNT(" << SR << "): ";
       break;
-    case VS_CNT:
-      OS << "    VS_CNT(" << SR << "): ";
+    case STORE_CNT:
+      OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
+         << SR << "): ";
+      break;
+    case SAMPLE_CNT:
+      OS << "    SAMPLE_CNT(" << SR << "): ";
+      break;
+    case BVH_CNT:
+      OS << "    BVH_CNT(" << SR << "): ";
+      break;
+    case KM_CNT:
+      OS << "    KM_CNT(" << SR << "): ";
       break;
     default:
       OS << "    UNKNOWN(" << SR << "): ";
@@ -751,9 +958,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
         }
       }
       // Also need to print sgpr scores for lgkm_cnt.
-      if (T == LGKM_CNT) {
+      if (T == SmemAccessCounter) {
         for (int J = 0; J <= SgprUB; J++) {
-          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
           if (RegScore <= LB)
             continue;
           unsigned RelScore = RegScore - LB - 1;
@@ -769,10 +976,13 @@ void WaitcntBrackets::print(raw_ostream &OS) {
 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
 /// whether a waitcnt instruction is needed at all.
 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
-  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
   simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
-  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
-  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
+  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
+  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
+  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
+  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
 }
 
 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -793,8 +1003,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
   const unsigned LB = getScoreLB(T);
   const unsigned UB = getScoreUB(T);
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
-    if ((T == VM_CNT || T == LGKM_CNT) &&
-        hasPendingFlat() &&
+    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
         !ST->hasFlatLgkmVMemCountInOrder()) {
       // If there is a pending FLAT operation, and this is a VMem or LGKM
       // waitcnt and the target can report early completion, then we need
@@ -815,10 +1024,13 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
 }
 
 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
-  applyWaitcnt(VM_CNT, Wait.VmCnt);
+  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
-  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
-  applyWaitcnt(VS_CNT, Wait.VsCnt);
+  applyWaitcnt(DS_CNT, Wait.DsCnt);
+  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
+  applyWaitcnt(KM_CNT, Wait.KmCnt);
 }
 
 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -839,7 +1051,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
 // the decrement may go out of order.
 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
   // Scalar memory read always can go out of order.
-  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
+  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
     return true;
   return hasMixedPendingEvents(T);
 }
@@ -873,22 +1085,49 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
   return true;
 }
 
-bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
-  unsigned Opcode = Waitcnt->getOpcode();
-  if (!SIInstrInfo::isSoftWaitcnt(Opcode))
+/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT opcode,
+/// and if so, which counter it waits on.
+static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_WAIT_LOADCNT:
+    return LOAD_CNT;
+  case AMDGPU::S_WAIT_EXPCNT:
+    return EXP_CNT;
+  case AMDGPU::S_WAIT_STORECNT:
+    return STORE_CNT;
+  case AMDGPU::S_WAIT_SAMPLECNT:
+    return SAMPLE_CNT;
+  case AMDGPU::S_WAIT_BVHCNT:
+    return BVH_CNT;
+  case AMDGPU::S_WAIT_DSCNT:
+    return DS_CNT;
+  case AMDGPU::S_WAIT_KMCNT:
+    return KM_CNT;
+  default:
+    return {};
+  }
+}
+
+bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
+  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
+  if (Opcode == Waitcnt->getOpcode())
     return false;
 
-  Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
+  Waitcnt->setDesc(TII->get(Opcode));
   return true;
 }
 
-/// Combine consecutive waitcnt instructions that precede \p It and follow
-/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
-/// by previous passes. Currently this pass conservatively assumes that these
-/// preexisting waitcnt are required for correctness.
-bool SIInsertWaitcnts::applyPreexistingWaitcnt(
+/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
+/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
+/// from \p Wait that were added by previous passes. Currently this pass
+/// conservatively assumes that these preexisting waits are required for
+/// correctness.
+bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
     WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
     AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+  assert(ST);
+  assert(isNormalMode(MaxCounter));
+
   bool Modified = false;
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
@@ -898,12 +1137,12 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
     if (II.isMetaInstruction())
       continue;
 
-    unsigned Opcode = II.getOpcode();
-    bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode);
+    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+    bool IsSoft = Opcode != II.getOpcode();
 
-    if (SIInstrInfo::isWaitcnt(Opcode)) {
-      // Update required wait count. If this is a soft waitcnt (= it was added
-      // by an earlier pass), it may be entirely removed.
+    // Update required wait count. If this is a soft waitcnt (= it was added
+    // by an earlier pass), it may be entirely removed.
+    if (Opcode == AMDGPU::S_WAITCNT) {
       unsigned IEnc = II.getOperand(0).getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
       if (IsSoft)
@@ -911,23 +1150,22 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
       Wait = Wait.combined(OldWait);
 
       // Merge consecutive waitcnt of the same type by erasing multiples.
-      if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) {
+      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
         II.eraseFromParent();
         Modified = true;
       } else
         WaitcntInstr = &II;
-
     } else {
-      assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
+      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
       assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
 
       unsigned OldVSCnt =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       if (IsSoft)
-        ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
-      Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
+      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
 
-      if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) {
+      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
         II.eraseFromParent();
         Modified = true;
       } else
@@ -935,18 +1173,19 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
     }
   }
 
-  // Updated encoding of merged waitcnt with the required wait.
   if (WaitcntInstr) {
     Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                          AMDGPU::encodeWaitcnt(IV, Wait));
     Modified |= promoteSoftWaitCnt(WaitcntInstr);
 
-    ScoreBrackets.applyWaitcnt(Wait);
-    Wait.VmCnt = ~0u;
-    Wait.LgkmCnt = ~0u;
+    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+    Wait.LoadCnt = ~0u;
     Wait.ExpCnt = ~0u;
+    Wait.DsCnt = ~0u;
 
-    LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                    ? dbgs()
                          << "applyPreexistingWaitcnt\n"
                          << "New Instr at block end: " << *WaitcntInstr << '\n'
@@ -957,12 +1196,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
 
   if (WaitcntVsCntInstr) {
     Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
-                                         AMDGPU::OpName::simm16, Wait.VsCnt);
+                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
     Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
-    ScoreBrackets.applyWaitcnt(Wait);
-    Wait.VsCnt = ~0u;
 
-    LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+    Wait.StoreCnt = ~0u;
+
+    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                    ? dbgs() << "applyPreexistingWaitcnt\n"
                             << "New Instr at block end: " << *WaitcntVsCntInstr
                             << '\n'
@@ -974,6 +1214,293 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
   return Modified;
 }
 
+/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
+/// required counters in \p Wait
+bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
+    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+    AMDGPU::Waitcnt Wait) {
+  assert(ST);
+  assert(isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  const DebugLoc &DL = Block.findDebugLoc(It);
+
+  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
+  // single instruction while VScnt has its own instruction.
+  if (Wait.hasWaitExceptStoreCnt()) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    [[maybe_unused]] auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  if (Wait.hasWaitStoreCnt()) {
+    assert(ST->hasVscnt());
+
+    [[maybe_unused]] auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+            .addImm(Wait.StoreCnt);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
+
+/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
+/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
+/// were added by previous passes. Currently this pass conservatively
+/// assumes that these preexisting waits are required for correctness.
+bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
+    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+  assert(ST);
+  assert(!isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  MachineInstr *CombinedLoadDsCntInstr = nullptr;
+  MachineInstr *CombinedStoreDsCntInstr = nullptr;
+  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
+
+  for (auto &II :
+       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+    if (II.isMetaInstruction())
+      continue;
+
+    MachineInstr **UpdatableInstr;
+
+    // Update required wait count. If this is a soft waitcnt (= it was added
+    // by an earlier pass), it may be entirely removed.
+
+    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+    bool IsSoft = Opcode != II.getOpcode();
+
+    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
+      unsigned OldEnc =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait);
+      Wait = Wait.combined(OldWait);
+      UpdatableInstr = &CombinedLoadDsCntInstr;
+    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
+      unsigned OldEnc =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait);
+      Wait = Wait.combined(OldWait);
+      UpdatableInstr = &CombinedStoreDsCntInstr;
+    } else {
+      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
+      assert(CT.has_value());
+      unsigned OldCnt =
+          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+      if (IsSoft)
+        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
+      addWait(Wait, CT.value(), OldCnt);
+      UpdatableInstr = &WaitInstrs[CT.value()];
+    }
+
+    // Merge consecutive waitcnt of the same type by erasing multiples.
+    if (!*UpdatableInstr) {
+      *UpdatableInstr = &II;
+    } else {
+      II.eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  if (CombinedLoadDsCntInstr) {
+    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
+    // to be waited for. Otherwise, let the instruction be deleted so
+    // the appropriate single counter wait instruction can be inserted
+    // instead, when new S_WAIT_*CNT instructions are inserted by
+    // createNewWaitcnt(). As a side effect, resetting the wait counts will
+    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
+    // the loop below that deals with single counter instructions.
+    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
+      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
+                                           AMDGPU::OpName::simm16, NewEnc);
+      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
+      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+      Wait.LoadCnt = ~0u;
+      Wait.DsCnt = ~0u;
+
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: "
+                              << *CombinedLoadDsCntInstr << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It << "New Instr: "
+                              << *CombinedLoadDsCntInstr << '\n');
+    } else {
+      CombinedLoadDsCntInstr->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  if (CombinedStoreDsCntInstr) {
+    // Similarly for S_WAIT_STORECNT_DSCNT.
+    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
+      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
+                                           AMDGPU::OpName::simm16, NewEnc);
+      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
+      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+      Wait.StoreCnt = ~0u;
+      Wait.DsCnt = ~0u;
+
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: "
+                              << *CombinedStoreDsCntInstr << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It << "New Instr: "
+                              << *CombinedStoreDsCntInstr << '\n');
+    } else {
+      CombinedStoreDsCntInstr->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
+  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
+  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
+  // instructions so that createNewWaitcnt() will create new combined
+  // instructions to replace them.
+
+  if (Wait.DsCnt != ~0u) {
+    // This is a vector of addresses in WaitInstrs pointing to instructions
+    // that should be removed if they are present.
+    SmallVector<MachineInstr **, 2> WaitsToErase;
+
+    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
+    // both) need to be waited for, ensure that there are no existing
+    // individual wait count instructions for these.
+
+    if (Wait.LoadCnt != ~0u) {
+      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
+      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+    } else if (Wait.StoreCnt != ~0u) {
+      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
+      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+    }
+
+    for (MachineInstr **WI : WaitsToErase) {
+      if (!*WI)
+        continue;
+
+      (*WI)->eraseFromParent();
+      *WI = nullptr;
+      Modified = true;
+    }
+  }
+
+  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+    if (!WaitInstrs[CT])
+      continue;
+
+    unsigned NewCnt = getWait(Wait, CT);
+    if (NewCnt != ~0u) {
+      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
+                                           AMDGPU::OpName::simm16, NewCnt);
+      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
+
+      ScoreBrackets.applyWaitcnt(CT, NewCnt);
+      setNoWait(Wait, CT);
+
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: " << *WaitInstrs[CT]
+                              << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It
+                              << "New Instr: " << *WaitInstrs[CT] << '\n');
+    } else {
+      WaitInstrs[CT]->eraseFromParent();
+      Modified = true;
+    }
+  }
+
+  return Modified;
+}
+
+/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
+bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
+    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+    AMDGPU::Waitcnt Wait) {
+  assert(ST);
+  assert(!isNormalMode(MaxCounter));
+
+  bool Modified = false;
+  const DebugLoc &DL = Block.findDebugLoc(It);
+
+  // Check for opportunities to use combined wait instructions.
+  if (Wait.DsCnt != ~0u) {
+    MachineInstr *SWaitInst = nullptr;
+
+    if (Wait.LoadCnt != ~0u) {
+      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+
+      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+                      .addImm(Enc);
+
+      Wait.LoadCnt = ~0u;
+      Wait.DsCnt = ~0u;
+    } else if (Wait.StoreCnt != ~0u) {
+      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+
+      SWaitInst =
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
+              .addImm(Enc);
+
+      Wait.StoreCnt = ~0u;
+      Wait.DsCnt = ~0u;
+    }
+
+    if (SWaitInst) {
+      Modified = true;
+
+      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+                 dbgs() << "New Instr: " << *SWaitInst << '\n');
+    }
+  }
+
+  // Generate an instruction for any remaining counter that needs
+  // waiting for.
+
+  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+    unsigned Count = getWait(Wait, CT);
+    if (Count == ~0u)
+      continue;
+
+    auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(Count);
+
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
+
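
Taken together, the gfx12 generator applies a small priority scheme: a
pending DScnt wait is folded into S_WAIT_LOADCNT_DSCNT when a LOADcnt wait is
also pending, or into S_WAIT_STORECNT_DSCNT when a STOREcnt wait is, and
every counter still outstanding afterwards gets its own single-counter
instruction from instrsForExtendedCounterTypes. For a hypothetical incoming
Wait value, the emitted sequence would be roughly:

  // Wait = { LoadCnt = 0, DsCnt = 0, StoreCnt = 2, KmCnt = 0 }  (made up)
  //   S_WAIT_LOADCNT_DSCNT ...  ; folds LoadCnt = 0 and DsCnt = 0
  //   S_WAIT_STORECNT 2         ; remaining single counter
  //   S_WAIT_KMCNT 0            ; remaining single counter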
 static bool readsVCCZ(const MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -1027,7 +1554,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
       MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
       MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
-    Wait.VmCnt = 0;
+    Wait.LoadCnt = 0;
   }
 
   // All waits must be resolved at call return.
@@ -1037,16 +1564,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       MI.getOpcode() == AMDGPU::SI_RETURN ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
-    Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+    Wait = Wait.combined(
+        AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
   }
   // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
   // stores. In this case it can be useful to send a message to explicitly
   // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if there are no outstanding scratch stores.
+  // do this if:
+  // * there are no outstanding scratch stores
+  // * we are not in Dynamic VGPR mode
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
     if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
-        ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
+        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
         !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
       ReleaseVGPRInsts.insert(&MI);
   }
@@ -1056,7 +1586,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
            ST->hasLegacyGeometry() &&
            ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
             AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
-    Wait.VmCnt = 0;
+    Wait.LoadCnt = 0;
   }
 #if 0 // TODO: the following blocks of logic when we have fence.
   else if (MI.getOpcode() == SC_FENCE) {
@@ -1073,12 +1603,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         case SCMEM_LDS:
           if (group_is_multi_wave ||
             context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
-            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
-                               ScoreBrackets->getScoreUB(LGKM_CNT));
-            // LDS may have to wait for VM_CNT after buffer load to LDS
+            EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
+                               ScoreBrackets->getScoreUB(DS_CNT));
+            // LDS may have to wait for VMcnt after buffer load to LDS
             if (target_info->HasBufferLoadToLDS()) {
-              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
-                                 ScoreBrackets->getScoreUB(VM_CNT));
+              EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
+                                 ScoreBrackets->getScoreUB(LOAD_CNT));
             }
           }
           break;
@@ -1087,8 +1617,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           if (group_is_multi_wave || fence_is_global) {
             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
               ScoreBrackets->getScoreUB(EXP_CNT));
-            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
-              ScoreBrackets->getScoreUB(LGKM_CNT));
+            EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
+              ScoreBrackets->getScoreUB(DS_CNT));
           }
           break;
 
@@ -1099,8 +1629,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           if (group_is_multi_wave || fence_is_global) {
             EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
               ScoreBrackets->getScoreUB(EXP_CNT));
-            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
-              ScoreBrackets->getScoreUB(VM_CNT));
+            EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
+              ScoreBrackets->getScoreUB(LOAD_CNT));
           }
           break;
 
@@ -1143,7 +1673,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 
         for (int RegNo = CallAddrOpInterval.first;
              RegNo < CallAddrOpInterval.second; ++RegNo)
-          ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
+          ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
 
         int RtnAddrOpIdx =
           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
@@ -1153,7 +1683,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 
           for (int RegNo = RtnAddrOpInterval.first;
                RegNo < RtnAddrOpInterval.second; ++RegNo)
-            ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
+            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
         }
       }
     } else {
@@ -1170,10 +1700,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       // instruction to guarantee the right WAW order.
       // 2) If a destination operand that was used by a recent export/store ins,
       // add s_waitcnt on exp_cnt to guarantee the WAR order.
+
       for (const MachineMemOperand *Memop : MI.memoperands()) {
         const Value *Ptr = Memop->getValue();
         if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
-          addWait(Wait, LGKM_CNT, 0);
+          addWait(Wait, SmemAccessCounter, 0);
           if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
             SLoadAddresses.erase(Ptr);
         }
@@ -1184,8 +1715,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (TII->mayWriteLDSThroughDMA(MI))
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        // VM_CNT is only relevant to vgpr or LDS.
-        ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+        // LOAD_CNT is only relevant to vgpr or LDS.
+        ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
         if (Memop->isStore()) {
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
         }
@@ -1213,14 +1744,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
             if (Op.isUse() || !updateVMCntOnly(MI) ||
                 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
                                                        getVmemType(MI))) {
-              ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+              ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+              ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
+              ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
               ScoreBrackets.clearVgprVmemTypes(RegNo);
             }
             if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
               ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
             }
+            ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
+          } else {
+            ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
           }
-          ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
         }
       }
     }
@@ -1232,7 +1767,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
-    Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
+    Wait = Wait.combined(
+        AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
   }
 
   // TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1240,7 +1776,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   //       independent of target.
   if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
     if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-      Wait.LgkmCnt = 0;
+      Wait.DsCnt = 0;
     }
   }
 
@@ -1248,35 +1784,54 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   ScoreBrackets.simplifyWaitcnt(Wait);
 
   if (ForceEmitZeroWaitcnts)
-    Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt();
+    Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
 
-  if (ForceEmitWaitcnt[VM_CNT])
-    Wait.VmCnt = 0;
+  if (ForceEmitWaitcnt[LOAD_CNT])
+    Wait.LoadCnt = 0;
   if (ForceEmitWaitcnt[EXP_CNT])
     Wait.ExpCnt = 0;
-  if (ForceEmitWaitcnt[LGKM_CNT])
-    Wait.LgkmCnt = 0;
+  if (ForceEmitWaitcnt[DS_CNT])
+    Wait.DsCnt = 0;
+  if (ForceEmitWaitcnt[SAMPLE_CNT])
+    Wait.SampleCnt = 0;
+  if (ForceEmitWaitcnt[BVH_CNT])
+    Wait.BvhCnt = 0;
+  if (ForceEmitWaitcnt[KM_CNT])
+    Wait.KmCnt = 0;
 
   if (FlushVmCnt) {
-    if (ScoreBrackets.hasPendingEvent(VM_CNT))
-      Wait.VmCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+      Wait.LoadCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+      Wait.SampleCnt = 0;
+    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+      Wait.BvhCnt = 0;
   }
 
   return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                          OldWaitcntInstr);
 }
 
-// Add a waitcnt to flush the vmcnt counter at the end of the given block if
-// needed.
+// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
+// end of the given block if needed.
 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
                                                WaitcntBrackets &ScoreBrackets,
                                                MachineInstr *OldWaitcntInstr) {
   AMDGPU::Waitcnt Wait;
 
-  if (!ScoreBrackets.hasPendingEvent(VM_CNT))
+  unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
+  unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
+  unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
+
+  if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
     return false;
 
-  Wait.VmCnt = 0;
+  if (LoadCntPending != 0)
+    Wait.LoadCnt = 0;
+  if (SampleCntPending != 0)
+    Wait.SampleCnt = 0;
+  if (BvhCntPending != 0)
+    Wait.BvhCnt = 0;
 
   return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                          OldWaitcntInstr);
@@ -1288,15 +1843,16 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                        WaitcntBrackets &ScoreBrackets,
                                        MachineInstr *OldWaitcntInstr) {
   bool Modified = false;
-  const DebugLoc &DL = Block.findDebugLoc(It);
 
   if (OldWaitcntInstr)
     // Try to merge the required wait with preexisting waitcnt instructions.
     // Also erase redundant waitcnt.
     Modified =
-        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
-  else
-    ScoreBrackets.applyWaitcnt(Wait);
+        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+
+  // Any counts that could be applied to existing waitcnt instructions have
+  // already been applied above; now deal with any that remain.
+  ScoreBrackets.applyWaitcnt(Wait);
 
   // ExpCnt can be merged into VINTERP.
   if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -1309,35 +1865,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
     }
     Wait.ExpCnt = ~0u;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                       << "Update Instr: " << *It);
   }
 
-  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
-  // instruction was modified to handle the required wait.
-  if (Wait.hasWaitExceptVsCnt()) {
-    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-    [[maybe_unused]] auto SWaitInst =
-        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  if (WCG->createNewWaitcnt(Block, It, Wait))
     Modified = true;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
-  }
-
-  if (Wait.hasWaitVsCnt()) {
-    assert(ST->hasVscnt());
-
-    [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
-                         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
-                         .addImm(Wait.VsCnt);
-    Modified = true;
-
-    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
-  }
   return Modified;
 }
 
@@ -1435,7 +1969,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
   // bracket and the destination operand scores.
-  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
+  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
+
   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -1486,7 +2021,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   } else if (Inst.isCall()) {
     if (callWaitsOnFunctionReturn(Inst)) {
       // Act as a wait on everything
-      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+      ScoreBrackets->applyWaitcnt(
+          AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
     } else {
       // May need to way wait for anything.
       ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
@@ -1545,7 +2081,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
   VgprUB = std::max(VgprUB, Other.VgprUB);
   SgprUB = std::max(SgprUB, Other.SgprUB);
 
-  for (auto T : inst_counter_types()) {
+  for (auto T : inst_counter_types(MaxCounter)) {
     // Merge event flags for this counter
     const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
     const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
@@ -1573,7 +2109,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
-    if (T == LGKM_CNT) {
+    if (T == SmemAccessCounter) {
       for (int J = 0; J <= SgprUB; J++)
         StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
     }
@@ -1589,10 +2125,13 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
 }
 
 static bool isWaitInstr(MachineInstr &Inst) {
-  auto Opcode = Inst.getOpcode();
-  return SIInstrInfo::isWaitcnt(Opcode) ||
-         (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
-          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
+  return Opcode == AMDGPU::S_WAITCNT ||
+         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
+          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
+         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
+         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+         counterTypeForInstr(Opcode).has_value();
 }
 
 // Generate s_waitcnt instructions where needed.
@@ -1698,8 +2237,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     // an S_WAITCNT vmcnt(0)
     if (RequireCheckResourceType(Inst, context)) {
       // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
-      ScoreBrackets->setScoreLB(VM_CNT,
-      ScoreBrackets->getScoreUB(VM_CNT));
+      ScoreBrackets->setScoreLB(LOAD_CNT,
+      ScoreBrackets->getScoreUB(LOAD_CNT));
     }
 #endif
 
@@ -1801,7 +2340,12 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
             VgprUse.insert(RegNo);
             // If at least one of Op's registers is in the score brackets, the
             // value is likely loaded outside of the loop.
-            if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
+            if (Brackets.getRegScore(RegNo, LOAD_CNT) >
+                    Brackets.getScoreLB(LOAD_CNT) ||
+                Brackets.getRegScore(RegNo, SAMPLE_CNT) >
+                    Brackets.getScoreLB(SAMPLE_CNT) ||
+                Brackets.getRegScore(RegNo, BVH_CNT) >
+                    Brackets.getScoreLB(BVH_CNT)) {
               UsesVgprLoadedOutside = true;
               break;
             }
@@ -1829,23 +2373,46 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
-  IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
+  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+
+  if (ST->hasExtendedWaitCounts()) {
+    MaxCounter = NUM_EXTENDED_INST_CNTS;
+    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
+    WCG = &WCGGFX12Plus;
+  } else {
+    MaxCounter = NUM_NORMAL_INST_CNTS;
+    WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
+    WCG = &WCGPreGFX12;
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
     ForceEmitWaitcnt[T] = false;
 
+  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+
+  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
+
   OptNone = MF.getFunction().hasOptNone() ||
             MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
 
   HardwareLimits Limits = {};
-  Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+  if (ST->hasExtendedWaitCounts()) {
+    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
+    Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
+  } else {
+    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
+    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
+  }
   Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
-  Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
-  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
+  Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
+  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
+  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
+  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
 
   unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
   unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
@@ -1863,6 +2430,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   BlockInfos.clear();
   bool Modified = false;
 
+  MachineBasicBlock &EntryBB = MF.front();
+  MachineBasicBlock::iterator I = EntryBB.begin();
+
   if (!MFI->isEntryFunction()) {
     // Wait for any outstanding memory operations that the input registers may
     // depend on. We can't track them and it's better to do the wait after the
@@ -1870,15 +2440,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
     // TODO: Could insert earlier and schedule more liberally with operations
     // that only use caller preserved registers.
-    MachineBasicBlock &EntryBB = MF.front();
-    MachineBasicBlock::iterator I = EntryBB.begin();
     for (MachineBasicBlock::iterator E = EntryBB.end();
          I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
       ;
-    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
 
-    auto NonKernelInitialState =
-        std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+    if (ST->hasExtendedWaitCounts()) {
+      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+          .addImm(0);
+      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+          continue;
+
+        BuildMI(EntryBB, I, DebugLoc(),
+                TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(0);
+      }
+    } else {
+      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+    }
+
+    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
+        ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
+        SmemAccessCounter);
     NonKernelInitialState->setNonKernelFunctionInitialState();
     BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
 
@@ -1909,9 +2492,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
           *Brackets = *BI.Incoming;
       } else {
         if (!Brackets)
-          Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+          Brackets = std::make_unique<WaitcntBrackets>(
+              ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
+              SmemAccessCounter);
         else
-          *Brackets = WaitcntBrackets(ST, Limits, Encoding);
+          *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
+                                      WaitEventMaskForInst, SmemAccessCounter);
       }
 
       Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4c7a457e9aae2..5380f6d1fc88f6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9065,8 +9065,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
 }
 
 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
-  if (SIInstrInfo::isSoftWaitcnt(Opcode))
-    Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
+  Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
 
   unsigned Gen = subtargetEncodingFamily(ST);
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 37ee159362a28c..6d0d818df84aa6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -905,29 +905,24 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
-    if (isWaitcnt(Opcode))
+    switch (Opcode) {
+    case AMDGPU::S_WAITCNT_soft:
       return AMDGPU::S_WAITCNT;
-
-    if (isWaitcntVsCnt(Opcode))
+    case AMDGPU::S_WAITCNT_VSCNT_soft:
       return AMDGPU::S_WAITCNT_VSCNT;
-
-    llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
-  }
-
-  static bool isWaitcnt(unsigned Opcode) {
-    return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
-  }
-
-  static bool isWaitcntVsCnt(unsigned Opcode) {
-    return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
-           Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
-  }
-
-  // "Soft" waitcnt instructions can be relaxed/optimized out by
-  // SIInsertWaitcnts.
-  static bool isSoftWaitcnt(unsigned Opcode) {
-    return Opcode == AMDGPU::S_WAITCNT_soft ||
-           Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+    case AMDGPU::S_WAIT_LOADCNT_soft:
+      return AMDGPU::S_WAIT_LOADCNT;
+    case AMDGPU::S_WAIT_STORECNT_soft:
+      return AMDGPU::S_WAIT_STORECNT;
+    case AMDGPU::S_WAIT_SAMPLECNT_soft:
+      return AMDGPU::S_WAIT_SAMPLECNT;
+    case AMDGPU::S_WAIT_BVHCNT_soft:
+      return AMDGPU::S_WAIT_BVHCNT;
+    case AMDGPU::S_WAIT_DSCNT_soft:
+      return AMDGPU::S_WAIT_DSCNT;
+    default:
+      return Opcode;
+    }
   }
 
   bool isVGPRCopy(const MachineInstr &MI) const {
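A note on the rewritten helper above: the switch maps each *_soft pseudo back to its hardware opcode and passes every other opcode through unchanged, which is what lets pseudoToMCOpcode call it unconditionally. A minimal sketch of that behaviour (illustrative only, not part of the patch):

  // Soft wait pseudos are relaxed to their real opcodes; anything else is
  // returned unchanged, so the call is safe on arbitrary opcodes.
  unsigned Real =
      SIInstrInfo::getNonSoftWaitcntOpcode(AMDGPU::S_WAIT_LOADCNT_soft);
  // Real == AMDGPU::S_WAIT_LOADCNT
  unsigned Same = SIInstrInfo::getNonSoftWaitcntOpcode(AMDGPU::S_WAITCNT);
  // Same == AMDGPU::S_WAITCNT
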
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6d749ad1ad24f6..84b9330ef9633e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -579,11 +579,30 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
 };
 
 class SIGfx12CacheControl : public SIGfx11CacheControl {
+protected:
+  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
+  // \returns true if \p MI is modified, false otherwise.
+  bool setTH(const MachineBasicBlock::iterator MI,
+             AMDGPU::CPol::CPol Value) const;
+  // Sets Scope policy to \p Value if CPol operand is present in instruction
+  // \p MI. \returns true if \p MI is modified, false otherwise.
+  bool setScope(const MachineBasicBlock::iterator MI,
+                AMDGPU::CPol::CPol Value) const;
+
 public:
   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
 
+  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;
+
   bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                                      bool IsVolatile,
+                                      bool IsNonTemporal) const override;
 };
 
 class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2142,6 +2161,132 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
+                                AMDGPU::CPol::CPol Value) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (!CPol)
+    return false;
+
+  uint64_t NewTH = Value & AMDGPU::CPol::TH;
+  if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
+    CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
+    return true;
+  }
+
+  return false;
+}
+
+bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
+                                   AMDGPU::CPol::CPol Value) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (!CPol)
+    return false;
+
+  uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
+  if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
+    CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
+    return true;
+  }
+
+  return false;
+}
+
+bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+                                     bool IsCrossAddrSpaceOrdering,
+                                     Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  bool LOADCnt = false;
+  bool DSCnt = false;
+  bool STORECnt = false;
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
+      SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+        LOADCnt |= true;
+      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+        STORECnt |= true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU
+      // of the WGP, so we need to wait for operations to complete to ensure
+      // they are visible to waves on the other CU, as the L0 is per CU.
+      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
+      // which shares the same L0.
+      if (!ST.isCuModeEnabled()) {
+        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+          LOADCnt |= true;
+        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+          STORECnt |= true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The L0 cache keeps all memory operations in order for
+      // work-items in the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+      // If there is no cross address space ordering then an LDS wait
+      // (S_WAIT_DSCNT 0) is not needed, as LDS operations for all waves are
+      // executed in a total global ordering as observed by all waves. It is
+      // required if also synchronizing with global/GDS memory, as LDS
+      // operations could be reordered with respect to later global/GDS memory
+      // operations of the same wave.
+      DSCnt |= IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The LDS keeps all memory operations in order for
+      // the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (LOADCnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  if (STORECnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  if (DSCnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
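For reference, a rough sketch of how the memory legalizer is expected to call the hook above (illustrative only; Changed, CC and MI are the legalizer's own state, and Position::BEFORE is assumed to be the counterpart of the Position::AFTER used above). For an agent-scope operation on global plus LDS memory with cross address space ordering, the hook emits zero waits on BVHCNT, SAMPLECNT and LOADCNT for loads, STORECNT for stores, and DSCNT, all as *_soft pseudos that SIInsertWaitcnts can later merge or relax:

  Changed |= CC->insertWait(MI, SIAtomicScope::AGENT,
                            SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS,
                            SIMemOp::LOAD | SIMemOp::STORE,
                            /*IsCrossAddrSpaceOrdering=*/true,
                            Position::BEFORE);
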
 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
@@ -2198,6 +2343,41 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return true;
 }
 
+bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+
+  // Only handle load and store, not atomic read-modify-write instructions.
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so they cannot be
+  // handled here without pessimizing all atomics. They also do not support
+  // the nontemporal attribute.
+  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+
+    // Ensure operation has completed at system scope to cause all volatile
+    // operations to be visible outside the program in a global order. Do not
+    // request cross address space as only the global address space can be
+    // observable outside the program, so no need to cause a waitcnt for LDS
+    // address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+  }
+
+  if (IsNonTemporal) {
+    // Set non-temporal hint for all cache levels.
+    Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
+  }
+
+  return Changed;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
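The gfx12 volatile and nontemporal handling added above only rewrites the cache-policy (CPol) immediate, plus a trailing system-scope wait in the volatile case. A sketch of the two field updates it performs, using the CPol names from the code (illustrative only; a standalone mirror of setScope()/setTH(), not the patch itself):

  unsigned CPol = 0;
  // Volatile: force system scope so the access is visible outside the program.
  CPol = (CPol & ~AMDGPU::CPol::SCOPE) | AMDGPU::CPol::SCOPE_SYS;
  // Nontemporal: set the non-temporal hint for all cache levels.
  CPol = (CPol & ~AMDGPU::CPol::TH) | AMDGPU::CPol::TH_NT;
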
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 46fa3d57a21cb2..53b4faeb7e0345 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1597,6 +1597,13 @@ def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
 // that doesn't access memory.
 def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
 def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">;
+let SubtargetPredicate = isGFX12Plus in {
+  def S_WAIT_LOADCNT_soft : SOPP_Pseudo <"s_soft_wait_loadcnt", (ins s16imm:$simm16), "$simm16">;
+  def S_WAIT_STORECNT_soft : SOPP_Pseudo <"s_soft_wait_storecnt", (ins s16imm:$simm16), "$simm16">;
+  def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
+  def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
+  def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+}
 
 def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
     [(int_amdgcn_s_sethalt timm:$simm16)]>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 26ba2575ff34ac..b4f7fc456f0bdd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -94,6 +94,44 @@ unsigned getVmcntBitWidthHi(unsigned VersionMajor) {
   return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0;
 }
 
+/// \returns Loadcnt bit width
+unsigned getLoadcntBitWidth(unsigned VersionMajor) {
+  return VersionMajor >= 12 ? 6 : 0;
+}
+
+/// \returns Samplecnt bit width.
+unsigned getSamplecntBitWidth(unsigned VersionMajor) {
+  return VersionMajor >= 12 ? 6 : 0;
+}
+
+/// \returns Bvhcnt bit width.
+unsigned getBvhcntBitWidth(unsigned VersionMajor) {
+  return VersionMajor >= 12 ? 3 : 0;
+}
+
+/// \returns Dscnt bit width.
+unsigned getDscntBitWidth(unsigned VersionMajor) {
+  return VersionMajor >= 12 ? 6 : 0;
+}
+
+/// \returns Dscnt bit shift in combined S_WAIT instructions.
+unsigned getDscntBitShift(unsigned VersionMajor) { return 0; }
+
+/// \returns Storecnt or Vscnt bit width, depending on VersionMajor.
+unsigned getStorecntBitWidth(unsigned VersionMajor) {
+  return VersionMajor >= 10 ? 6 : 0;
+}
+
+/// \returns Kmcnt bit width.
+unsigned getKmcntBitWidth(unsigned VersionMajor) {
+  return VersionMajor >= 12 ? 5 : 0;
+}
+
+/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
+unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
+  return VersionMajor >= 12 ? 8 : 0;
+}
+
 /// \returns VmVsrc bit width
 inline unsigned getVmVsrcBitWidth() { return 3; }
 
@@ -1229,6 +1267,18 @@ unsigned getVmcntBitMask(const IsaVersion &Version) {
          1;
 }
 
+unsigned getLoadcntBitMask(const IsaVersion &Version) {
+  return (1 << getLoadcntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getSamplecntBitMask(const IsaVersion &Version) {
+  return (1 << getSamplecntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getBvhcntBitMask(const IsaVersion &Version) {
+  return (1 << getBvhcntBitWidth(Version.Major)) - 1;
+}
+
 unsigned getExpcntBitMask(const IsaVersion &Version) {
   return (1 << getExpcntBitWidth(Version.Major)) - 1;
 }
@@ -1237,6 +1287,18 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) {
   return (1 << getLgkmcntBitWidth(Version.Major)) - 1;
 }
 
+unsigned getDscntBitMask(const IsaVersion &Version) {
+  return (1 << getDscntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getKmcntBitMask(const IsaVersion &Version) {
+  return (1 << getKmcntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getStorecntBitMask(const IsaVersion &Version) {
+  return (1 << getStorecntBitWidth(Version.Major)) - 1;
+}
+
 unsigned getWaitcntBitMask(const IsaVersion &Version) {
   unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
                                 getVmcntBitWidthLo(Version.Major));
@@ -1276,9 +1338,9 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
 
 Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) {
   Waitcnt Decoded;
-  Decoded.VmCnt = decodeVmcnt(Version, Encoded);
+  Decoded.LoadCnt = decodeVmcnt(Version, Encoded);
   Decoded.ExpCnt = decodeExpcnt(Version, Encoded);
-  Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded);
+  Decoded.DsCnt = decodeLgkmcnt(Version, Encoded);
   return Decoded;
 }
 
@@ -1313,7 +1375,85 @@ unsigned encodeWaitcnt(const IsaVersion &Version,
 }
 
 unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
-  return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
+  return encodeWaitcnt(Version, Decoded.LoadCnt, Decoded.ExpCnt, Decoded.DsCnt);
+}
+
+static unsigned getCombinedCountBitMask(const IsaVersion &Version,
+                                        bool IsStore) {
+  unsigned Dscnt = getBitMask(getDscntBitShift(Version.Major),
+                              getDscntBitWidth(Version.Major));
+  if (IsStore) {
+    unsigned Storecnt = getBitMask(getLoadcntStorecntBitShift(Version.Major),
+                                   getStorecntBitWidth(Version.Major));
+    return Dscnt | Storecnt;
+  } else {
+    unsigned Loadcnt = getBitMask(getLoadcntStorecntBitShift(Version.Major),
+                                  getLoadcntBitWidth(Version.Major));
+    return Dscnt | Loadcnt;
+  }
+}
+
+Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt) {
+  Waitcnt Decoded;
+  Decoded.LoadCnt =
+      unpackBits(LoadcntDscnt, getLoadcntStorecntBitShift(Version.Major),
+                 getLoadcntBitWidth(Version.Major));
+  Decoded.DsCnt = unpackBits(LoadcntDscnt, getDscntBitShift(Version.Major),
+                             getDscntBitWidth(Version.Major));
+  return Decoded;
+}
+
+Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt) {
+  Waitcnt Decoded;
+  Decoded.StoreCnt =
+      unpackBits(StorecntDscnt, getLoadcntStorecntBitShift(Version.Major),
+                 getStorecntBitWidth(Version.Major));
+  Decoded.DsCnt = unpackBits(StorecntDscnt, getDscntBitShift(Version.Major),
+                             getDscntBitWidth(Version.Major));
+  return Decoded;
+}
+
+static unsigned encodeLoadcnt(const IsaVersion &Version, unsigned Waitcnt,
+                              unsigned Loadcnt) {
+  return packBits(Loadcnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major),
+                  getLoadcntBitWidth(Version.Major));
+}
+
+static unsigned encodeStorecnt(const IsaVersion &Version, unsigned Waitcnt,
+                               unsigned Storecnt) {
+  return packBits(Storecnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major),
+                  getStorecntBitWidth(Version.Major));
+}
+
+static unsigned encodeDscnt(const IsaVersion &Version, unsigned Waitcnt,
+                            unsigned Dscnt) {
+  return packBits(Dscnt, Waitcnt, getDscntBitShift(Version.Major),
+                  getDscntBitWidth(Version.Major));
+}
+
+static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt,
+                                   unsigned Dscnt) {
+  unsigned Waitcnt = getCombinedCountBitMask(Version, false);
+  Waitcnt = encodeLoadcnt(Version, Waitcnt, Loadcnt);
+  Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt);
+  return Waitcnt;
+}
+
+unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded) {
+  return encodeLoadcntDscnt(Version, Decoded.LoadCnt, Decoded.DsCnt);
+}
+
+static unsigned encodeStorecntDscnt(const IsaVersion &Version,
+                                    unsigned Storecnt, unsigned Dscnt) {
+  unsigned Waitcnt = getCombinedCountBitMask(Version, true);
+  Waitcnt = encodeStorecnt(Version, Waitcnt, Storecnt);
+  Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt);
+  return Waitcnt;
+}
+
+unsigned encodeStorecntDscnt(const IsaVersion &Version,
+                             const Waitcnt &Decoded) {
+  return encodeStorecntDscnt(Version, Decoded.StoreCnt, Decoded.DsCnt);
 }
 
 //===----------------------------------------------------------------------===//
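Taken together with the bit widths and shifts defined earlier in this file, the combined gfx12 immediates place dscnt in bits [5:0] and loadcnt or storecnt in bits [13:8]. A minimal round-trip sketch, assuming IV is a gfx12 IsaVersion and that packBits/unpackBits write exactly the named field (illustrative only, not part of the patch):

  AMDGPU::Waitcnt W;
  W.LoadCnt = 3;
  W.DsCnt = 1;
  unsigned Imm = AMDGPU::encodeLoadcntDscnt(IV, W);   // (3 << 8) | 1 == 0x301
  AMDGPU::Waitcnt Back = AMDGPU::decodeLoadcntDscnt(IV, Imm);
  assert(Back.LoadCnt == 3 && Back.DsCnt == 1);
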
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 50c741760d7143..351563e957f14a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -837,39 +837,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
 /// Large values (including the maximum possible integer) can be used to
 /// represent "don't care" waits.
 struct Waitcnt {
-  unsigned VmCnt = ~0u;
+  unsigned LoadCnt = ~0u; // Corresponds to Vmcnt prior to gfx12.
   unsigned ExpCnt = ~0u;
-  unsigned LgkmCnt = ~0u;
-  unsigned VsCnt = ~0u;
+  unsigned DsCnt = ~0u;     // Corresponds to LGKMcnt prior to gfx12.
+  unsigned StoreCnt = ~0u;  // Corresponds to VScnt on gfx10/gfx11.
+  unsigned SampleCnt = ~0u; // gfx12+ only.
+  unsigned BvhCnt = ~0u;    // gfx12+ only.
+  unsigned KmCnt = ~0u;     // gfx12+ only.
 
   Waitcnt() = default;
+  // Pre-gfx12 constructor.
   Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
-      : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
-
-  static Waitcnt allZero(bool HasVscnt) {
-    return Waitcnt(0, 0, 0, HasVscnt ? 0 : ~0u);
+      : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt),
+        SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {}
+
+  // gfx12+ constructor.
+  Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
+          unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt)
+      : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
+        SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
+
+  static Waitcnt allZero(bool Extended, bool HasStorecnt) {
+    return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0)
+                    : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u);
   }
-  static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); }
 
-  bool hasWait() const {
-    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
+  static Waitcnt allZeroExceptVsCnt(bool Extended) {
+    return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u);
   }
 
-  bool hasWaitExceptVsCnt() const {
-    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
-  }
+  bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
 
-  bool hasWaitVsCnt() const {
-    return VsCnt != ~0u;
+  bool hasWaitExceptStoreCnt() const {
+    return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
+           SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u;
   }
 
+  bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
+
   Waitcnt combined(const Waitcnt &Other) const {
-    return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
-                   std::min(LgkmCnt, Other.LgkmCnt),
-                   std::min(VsCnt, Other.VsCnt));
+    // Does the right thing provided self and Other are either both pre-gfx12
+    // or both gfx12+.
+    return Waitcnt(
+        std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
+        std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
+        std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
+        std::min(KmCnt, Other.KmCnt));
   }
 };
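A small usage sketch of the two construction paths (illustrative only): the pre-gfx12 form leaves the gfx12-only fields at ~0u ("don't care"), while the extended form zeroes all seven counters.

  // Pre-gfx12: only vmcnt/expcnt/lgkmcnt/vscnt are zeroed; SampleCnt, BvhCnt
  // and KmCnt remain ~0u.
  AMDGPU::Waitcnt Legacy =
      AMDGPU::Waitcnt::allZero(/*Extended=*/false, /*HasStorecnt=*/true);
  // gfx12+: every counter, including SampleCnt, BvhCnt and KmCnt, is zeroed.
  AMDGPU::Waitcnt Extended =
      AMDGPU::Waitcnt::allZero(/*Extended=*/true, /*HasStorecnt=*/true);
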
 
+// The following methods are only meaningful on targets that support
+// S_WAITCNT.
+
 /// \returns Vmcnt bit mask for given isa \p Version.
 unsigned getVmcntBitMask(const IsaVersion &Version);
 
@@ -893,17 +912,19 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
 
 /// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
 /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
-/// \p Lgkmcnt respectively.
+/// \p Lgkmcnt respectively. Should not be used on gfx12+; the instruction
+/// that needs it is deprecated there.
 ///
 /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
 ///     \p Vmcnt = \p Waitcnt[3:0]        (pre-gfx9)
 ///     \p Vmcnt = \p Waitcnt[15:14,3:0]  (gfx9,10)
-///     \p Vmcnt = \p Waitcnt[15:10]      (gfx11+)
+///     \p Vmcnt = \p Waitcnt[15:10]      (gfx11)
 ///     \p Expcnt = \p Waitcnt[6:4]       (pre-gfx11)
-///     \p Expcnt = \p Waitcnt[2:0]       (gfx11+)
+///     \p Expcnt = \p Waitcnt[2:0]       (gfx11)
 ///     \p Lgkmcnt = \p Waitcnt[11:8]     (pre-gfx10)
 ///     \p Lgkmcnt = \p Waitcnt[13:8]     (gfx10)
-///     \p Lgkmcnt = \p Waitcnt[9:4]      (gfx11+)
+///     \p Lgkmcnt = \p Waitcnt[9:4]      (gfx11)
+///
 void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
                    unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
 
@@ -922,26 +943,78 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
                        unsigned Lgkmcnt);
 
 /// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
-/// \p Version.
+/// \p Version. Should not be used on gfx12+; the instruction that needs it
+/// is deprecated there.
 ///
 /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
 ///     Waitcnt[2:0]   = \p Expcnt      (gfx11+)
 ///     Waitcnt[3:0]   = \p Vmcnt       (pre-gfx9)
 ///     Waitcnt[3:0]   = \p Vmcnt[3:0]  (gfx9,10)
 ///     Waitcnt[6:4]   = \p Expcnt      (pre-gfx11)
-///     Waitcnt[9:4]   = \p Lgkmcnt     (gfx11+)
+///     Waitcnt[9:4]   = \p Lgkmcnt     (gfx11)
 ///     Waitcnt[11:8]  = \p Lgkmcnt     (pre-gfx10)
 ///     Waitcnt[13:8]  = \p Lgkmcnt     (gfx10)
-///     Waitcnt[15:10] = \p Vmcnt       (gfx11+)
+///     Waitcnt[15:10] = \p Vmcnt       (gfx11)
 ///     Waitcnt[15:14] = \p Vmcnt[5:4]  (gfx9,10)
 ///
 /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
 /// isa \p Version.
+///
 unsigned encodeWaitcnt(const IsaVersion &Version,
                        unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
 
 unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
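As a concrete instance of the legacy layout documented above (gfx11 field positions), waiting only on vmcnt leaves the other fields at their 3- and 6-bit maxima, so the encoding can be checked by hand. A sketch assuming IV is a gfx11 IsaVersion (illustrative only):

  // Waitcnt[2:0] = Expcnt = 7, Waitcnt[9:4] = Lgkmcnt = 63,
  // Waitcnt[15:10] = Vmcnt = 0  =>  0x3f7.
  unsigned Enc = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, /*Expcnt=*/7,
                                       /*Lgkmcnt=*/63);
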
 
+// The following methods are only meaningful on targets that support
+// S_WAIT_*CNT, introduced with gfx12.
+
+/// \returns Loadcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support LOADcnt
+unsigned getLoadcntBitMask(const IsaVersion &Version);
+
+/// \returns Samplecnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support SAMPLEcnt
+unsigned getSamplecntBitMask(const IsaVersion &Version);
+
+/// \returns Bvhcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support BVHcnt
+unsigned getBvhcntBitMask(const IsaVersion &Version);
+
+/// \returns Dscnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support DScnt
+unsigned getDscntBitMask(const IsaVersion &Version);
+
+/// \returns Kmcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support KMcnt
+unsigned getKmcntBitMask(const IsaVersion &Version);
+
+/// \returns STOREcnt or VScnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support STOREcnt or VScnt.
+/// STOREcnt and VScnt are the same counter, the name used
+/// depends on the ISA version.
+unsigned getStorecntBitMask(const IsaVersion &Version);
+
+// The following are only meaningful on targets that support
+// S_WAIT_LOADCNT_DSCNT and S_WAIT_STORECNT_DSCNT.
+
+/// \returns Decoded Waitcnt structure from given \p LoadcntDscnt for given
+/// isa \p Version.
+Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt);
+
+/// \returns Decoded Waitcnt structure from given \p StorecntDscnt for given
+/// isa \p Version.
+Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt);
+
+/// \returns \p Loadcnt and \p Dscnt components of \p Decoded encoded as an
+/// immediate that can be used with S_WAIT_LOADCNT_DSCNT for given isa
+/// \p Version.
+unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
+/// \returns \p Storecnt and \p Dscnt components of \p Decoded encoded as an
+/// immediate that can be used with S_WAIT_STORECNT_DSCNT for given isa
+/// \p Version.
+unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
 namespace Hwreg {
 
 LLVM_READONLY
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index b850c37c4a2810..e3243ff186e3ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -25,7 +25,7 @@ define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -77,7 +77,7 @@ define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index 62e5bce23664cd..c7676e9da6f499 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -11,7 +11,11 @@ define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 {
 ;
 ; GFX12-LABEL: test_fmed3_f32_known_nnan_ieee_true:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -28,7 +32,11 @@ define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 {
 ;
 ; GFX12-LABEL: test_fmed3_f16_known_nnan_ieee_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f16_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul half %a, 2.0
@@ -47,7 +55,11 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 {
 ;
 ; GFX12-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f32_e64 v0, 0x41200000, v0 clamp
@@ -67,7 +79,11 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp
 ;
 ; GFX12-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -86,7 +102,11 @@ define float @test_fmed3_global_nnan(float %a) #3 {
 ;
 ; GFX12-LABEL: test_fmed3_global_nnan:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -109,7 +129,11 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 {
 ;
 ; GFX12-LABEL: test_fmed3_f32_maybe_NaN_ieee_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -129,7 +153,11 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 {
 ;
 ; GFX12-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f32_e64 v0, 0x41200000, v0 clamp
@@ -149,7 +177,11 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
 ;
 ; GFX12-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index bba3687dbbc2cc..ca0047bba6c4bd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -11,7 +11,11 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_ValK0_K1_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -29,7 +33,11 @@ define double @test_min_max_K0Val_K1_f64(double %a) #1 {
 ;
 ; GFX12-LABEL: test_min_max_K0Val_K1_f64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul double %a, 2.0
@@ -48,7 +56,11 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 {
 ;
 ; GFX12-LABEL: test_min_K1max_ValK0_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f16_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul half %a, 2.0
@@ -66,7 +78,11 @@ define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 {
 ;
 ; GFX12-LABEL: test_min_K1max_K0Val_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
@@ -84,7 +100,11 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
 ;
 ; GFX12-LABEL: test_min_max_splat_padded_with_undef:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
@@ -104,7 +124,11 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
 ;
 ; GFX12-LABEL: test_max_min_ValK1_K0_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -122,7 +146,11 @@ define double @test_max_min_K1Val_K0_f64(double %a) #1 {
 ;
 ; GFX12-LABEL: test_max_min_K1Val_K0_f64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul double %a, 2.0
@@ -140,7 +168,11 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
 ;
 ; GFX12-LABEL: test_max_K0min_ValK1_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f16_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul half %a, 2.0
@@ -159,7 +191,11 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
 ;
 ; GFX12-LABEL: test_max_K0min_K1Val_v2f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
@@ -179,7 +215,11 @@ define float @test_min_max_global_nnan(float %a) #3 {
 ;
 ; GFX12-LABEL: test_min_max_global_nnan:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v0, v0, v0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
@@ -196,7 +236,11 @@ define float @test_max_min_global_nnan(float %a) #3 {
 ;
 ; GFX12-LABEL: test_max_min_global_nnan:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v0, v0, v0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 1.0)
@@ -219,7 +263,11 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_K0_gt_K1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maxmin_num_f32 v0, v0, 1.0, 0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 1.0)
@@ -238,7 +286,11 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
 ;
 ; GFX12-LABEL: test_max_min_K0_gt_K1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0, 1.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float %a, float 0.0)
@@ -260,7 +312,11 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
 ;
 ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -280,7 +336,11 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
 ;
 ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -302,7 +362,11 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
 ;
 ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_true:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 1.0, 0
@@ -324,7 +388,11 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
 ;
 ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 1.0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index e132f7a0ec757a..b58c3b20986363 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -20,7 +20,11 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ;
 ; GFX12-LABEL: v_extract_v64i32_varidx:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 63, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
@@ -29,7 +33,7 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 %idx
@@ -51,7 +55,11 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ;
 ; GFX12-LABEL: v_extract_v128i16_varidx:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
@@ -60,7 +68,7 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <128 x i16>, ptr addrspace(1) %ptr
   %elt = extractelement <128 x i16> %vec, i32 %idx
@@ -82,7 +90,11 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ;
 ; GFX12-LABEL: v_extract_v32i64_varidx:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 31, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
@@ -91,7 +103,7 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <32 x i64>, ptr addrspace(1) %ptr
   %elt = extractelement <32 x i64> %vec, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 4603fbcd525c78..7a953c3ad80ad4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -79,17 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -168,11 +168,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, 4, v1
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -247,16 +247,20 @@ define void @store_load_vindex_foo(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_vindex_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, s32, v1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v1, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -301,7 +305,11 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: private_ptr_foo:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -391,18 +399,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -490,14 +499,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -588,18 +597,22 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_vindex_small_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x100
-; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, s0, v1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v1, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -697,18 +710,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -798,14 +812,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -898,18 +912,22 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_vindex_large_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
-; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, s0, v1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v1, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -988,12 +1006,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT:    scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -1068,14 +1086,18 @@ define void @store_load_large_imm_offset_foo() {
 ;
 ; GFX12-LABEL: store_load_large_imm_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT:    scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -1152,14 +1174,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
   %alloca = alloca [32 x i32], align 4, addrspace(5)
@@ -1218,13 +1240,17 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_i64_aligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -1278,13 +1304,17 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_i64_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -1354,16 +1384,20 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_v3i32_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s2, 3
 ; GFX12-NEXT:    s_mov_b32 s1, 2
 ; GFX12-NEXT:    s_mov_b32 s0, 1
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -1438,17 +1472,21 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_v4i32_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s3, 4
 ; GFX12-NEXT:    s_mov_b32 s2, 3
 ; GFX12-NEXT:    s_mov_b32 s1, 2
 ; GFX12-NEXT:    s_mov_b32 s0, 1
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index 096ca5bc8705f5..75c4cd53e3bfc7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -18,7 +18,11 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_ValK0_K1_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
@@ -41,7 +45,11 @@ define float @test_min_max_K0Val_K1_f32(float %a) #1 {
 ;
 ; GFX12-LABEL: test_min_max_K0Val_K1_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float 2.0, float %a)
@@ -69,7 +77,11 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 {
 ;
 ; GFX12-LABEL: test_min_K1max_ValK0_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_med3_num_f16 v0, v0, 2.0, 4.0
@@ -95,7 +107,11 @@ define half @test_min_K1max_K0Val_f16(half %a) #1 {
 ;
 ; GFX12-LABEL: test_min_K1max_K0Val_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f16 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan half @llvm.maxnum.f16(half 2.0, half %a)
@@ -119,7 +135,11 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
 ;
 ; GFX12-LABEL: test_max_min_ValK1_K0_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
@@ -142,7 +162,11 @@ define float @test_max_min_K1Val_K0_f32(float %a) #1 {
 ;
 ; GFX12-LABEL: test_max_min_K1Val_K0_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float 4.0, float %a)
@@ -166,7 +190,11 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
 ;
 ; GFX12-LABEL: test_max_K0min_ValK1_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f16 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan half @llvm.minnum.f16(half %a, half 4.0)
@@ -190,7 +218,11 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
 ;
 ; GFX12-LABEL: test_max_K0min_K1Val_f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f16 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan half @llvm.minnum.f16(half 4.0, half %a)
@@ -215,7 +247,11 @@ define float @test_min_max_global_nnan(float %a) #2 {
 ;
 ; GFX12-LABEL: test_min_max_global_nnan:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
@@ -238,7 +274,11 @@ define float @test_max_min_global_nnan(float %a) #2 {
 ;
 ; GFX12-LABEL: test_max_min_global_nnan:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
@@ -268,7 +308,11 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_K0_gt_K1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maxmin_num_f32 v0, v0, 4.0, 2.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 4.0)
@@ -294,7 +338,11 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
 ;
 ; GFX12-LABEL: test_max_min_K0_gt_K1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 2.0, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float %a, float 2.0)
@@ -320,7 +368,11 @@ define float @test_min_max_non_inline_const(float %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_non_inline_const:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maxmin_num_f32 v0, v0, 2.0, 0x41000000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
@@ -347,7 +399,11 @@ define double @test_min_max_f64(double %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_f64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], 2.0, v[0:1]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], 4.0, v[0:1]
@@ -379,7 +435,11 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
 ;
 ; GFX12-LABEL: test_min_max_v2f16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, 2.0 op_sel_hi:[1,0]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, 4.0 op_sel_hi:[1,0]
@@ -409,7 +469,11 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
 ;
 ; GFX12-LABEL: test_min_max_maybe_NaN_input_ieee_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, 2.0, 4.0
@@ -438,7 +502,11 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
 ;
 ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_false:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 4.0, 2.0
@@ -468,7 +536,11 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
 ;
 ; GFX12-LABEL: test_max_min_maybe_NaN_input_ieee_true:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 4.0, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index d76c9e4c1c1346..ade6e55b482bb7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -21,9 +21,13 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
 ;
 ; GFX12-LABEL: global_atomic_csub:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
   ret i32 %ret
@@ -50,9 +54,13 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
 ;
 ; GFX12-LABEL: global_atomic_csub_offset:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
@@ -76,9 +84,13 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ;
 ; GFX12-LABEL: global_atomic_csub_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
   ret void
@@ -105,9 +117,13 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ;
 ; GFX12-LABEL: global_atomic_csub_offset_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
@@ -145,10 +161,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -184,7 +200,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
index b2eb24272e1e51..a3796197e4c534 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
@@ -44,7 +44,7 @@ define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -92,7 +92,7 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -140,7 +140,7 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -188,7 +188,7 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -236,7 +236,7 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -284,7 +284,7 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -332,7 +332,7 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -380,7 +380,7 @@ define amdgpu_ps float @atomic_and_i321d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -428,7 +428,7 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -476,7 +476,7 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -524,7 +524,7 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -572,7 +572,7 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -620,7 +620,7 @@ define amdgpu_ps float @atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16(i32 %cmp, i32 %swap, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -674,7 +674,7 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 %data, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -728,7 +728,7 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16(i32 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -782,7 +782,7 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16(i32 %data, i16 %s, i16 %t, i16 %face, <8 x i32> %rsrc, i32 0, i32 0)
@@ -836,7 +836,7 @@ define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i16(i32 %data, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -890,7 +890,7 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i16(i32 %data, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -944,7 +944,7 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i16(i32 %data, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1004,7 +1004,7 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i16(i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1052,7 +1052,7 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -1100,7 +1100,7 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1148,7 +1148,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1196,7 +1196,7 @@ define amdgpu_ps <2 x float> @atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_sub_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.sub.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1244,7 +1244,7 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.smin.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1292,7 +1292,7 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.umin.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1340,7 +1340,7 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.smax.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1388,7 +1388,7 @@ define amdgpu_ps <2 x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.umax.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1436,7 +1436,7 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.and.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1484,7 +1484,7 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.or.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1532,7 +1532,7 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.xor.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1580,7 +1580,7 @@ define amdgpu_ps <2 x float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_inc_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.inc.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1628,7 +1628,7 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_dec_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.dec.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1676,7 +1676,7 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i16(i64 %cmp, i64 %swap, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1730,7 +1730,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16(i64 %data, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1784,7 +1784,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.3d.i64.i16(i64 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1838,7 +1838,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.cube.i64.i16(i64 %data, i16 %s, i16 %t, i16 %face , <8 x i32> %rsrc, i32 0, i32 0)
@@ -1892,7 +1892,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.1darray.i64.i16(i64 %data, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1946,7 +1946,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2darray.i64.i16(i64 %data, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2000,7 +2000,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2dmsaa.i64.i16(i64 %data, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2060,7 +2060,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2darraymsaa.i64.i16(i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2108,7 +2108,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
index 3c82cff00bda3c..221e2fd4f00f73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -90,7 +90,7 @@ define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -181,7 +181,7 @@ define amdgpu_ps float @atomic_add_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -272,7 +272,7 @@ define amdgpu_ps float @atomic_sub_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -363,7 +363,7 @@ define amdgpu_ps float @atomic_smin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -454,7 +454,7 @@ define amdgpu_ps float @atomic_umin_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -545,7 +545,7 @@ define amdgpu_ps float @atomic_smax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -636,7 +636,7 @@ define amdgpu_ps float @atomic_umax_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -727,7 +727,7 @@ define amdgpu_ps float @atomic_and_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -818,7 +818,7 @@ define amdgpu_ps float @atomic_or_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -909,7 +909,7 @@ define amdgpu_ps float @atomic_xor_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1000,7 +1000,7 @@ define amdgpu_ps float @atomic_inc_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1091,7 +1091,7 @@ define amdgpu_ps float @atomic_dec_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1181,7 +1181,7 @@ define amdgpu_ps float @atomic_cmpswap_i32_1d(<8 x i32> inreg %rsrc, i32 %cmp, i
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1356,7 +1356,7 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1449,7 +1449,7 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1542,7 +1542,7 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i3
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1634,7 +1634,7 @@ define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1727,7 +1727,7 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1820,7 +1820,7 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1914,7 +1914,7 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2005,7 +2005,7 @@ define amdgpu_ps float @atomic_add_i32_1d_slc(<8 x i32> inreg %rsrc, i32 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -2095,7 +2095,7 @@ define amdgpu_ps <2 x float> @atomic_swap_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2185,7 +2185,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2275,7 +2275,7 @@ define amdgpu_ps <2 x float> @atomic_sub_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_sub_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.sub.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2365,7 +2365,7 @@ define amdgpu_ps <2 x float> @atomic_smin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.smin.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2455,7 +2455,7 @@ define amdgpu_ps <2 x float> @atomic_umin_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_min_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.umin.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2545,7 +2545,7 @@ define amdgpu_ps <2 x float> @atomic_smax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_int v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.smax.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2635,7 +2635,7 @@ define amdgpu_ps <2 x float> @atomic_umax_i64_1d(<8 x i32> inreg %rsrc, i64 %dat
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_max_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.umax.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2725,7 +2725,7 @@ define amdgpu_ps <2 x float> @atomic_and_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.and.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2815,7 +2815,7 @@ define amdgpu_ps <2 x float> @atomic_or_i64_1d(<8 x i32> inreg %rsrc, i64 %data,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.or.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2905,7 +2905,7 @@ define amdgpu_ps <2 x float> @atomic_xor_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.xor.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2995,7 +2995,7 @@ define amdgpu_ps <2 x float> @atomic_inc_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_inc_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.inc.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3085,7 +3085,7 @@ define amdgpu_ps <2 x float> @atomic_dec_i64_1d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_dec_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.dec.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3175,7 +3175,7 @@ define amdgpu_ps <2 x float> @atomic_cmpswap_i64_1d(<8 x i32> inreg %rsrc, i64 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3348,7 +3348,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i32(i64 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3438,7 +3438,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.3d.i64.i32(i64 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3528,7 +3528,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.cube.i64.i32(i64 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3618,7 +3618,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.1darray.i64.i32(i64 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3708,7 +3708,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2darray.i64.i32(i64 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3798,7 +3798,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3, v4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2dmsaa.i64.i32(i64 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3888,7 +3888,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], [v2, v3, v4, v5], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.2darraymsaa.i64.i32(i64 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3978,7 +3978,7 @@ define amdgpu_ps <2 x float> @atomic_add_i64_1d_slc(<8 x i32> inreg %rsrc, i64 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_atomic_add_uint v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index b3b143372481e6..916b9c0835d41c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -71,7 +71,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -145,7 +145,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -219,7 +219,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -293,7 +293,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -367,7 +367,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -443,7 +443,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -517,7 +517,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -591,7 +591,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -667,7 +667,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -743,7 +743,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -808,7 +808,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -875,7 +875,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -940,7 +940,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -1005,7 +1005,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index e16c4a66e69914..841f4f1ac055ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -65,7 +65,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -154,7 +154,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_gather4 v[0:4], [v5, v6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
@@ -223,7 +223,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -291,7 +291,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -359,7 +359,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -427,7 +427,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -495,7 +495,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -563,7 +563,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -631,7 +631,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -699,7 +699,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -767,7 +767,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -826,7 +826,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -885,7 +885,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -944,7 +944,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -1003,7 +1003,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4_c_lz v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -1071,7 +1071,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -1139,7 +1139,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -1207,7 +1207,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32
 ; GFX12-NEXT:    s_mov_b32 s10, s12
 ; GFX12-NEXT:    s_mov_b32 s11, s13
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
index 0b6d66b02afd9e..d1a36d552e21b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
@@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -91,7 +91,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -138,7 +138,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -185,7 +185,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i16 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -232,7 +232,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i16 %mip
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -279,7 +279,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i16 %mip
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -326,7 +326,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i16 %mip)
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -373,7 +373,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i16
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
index 23be1ca05d948a..496f9f428580b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
@@ -59,7 +59,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -120,7 +120,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -181,7 +181,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -242,7 +242,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -303,7 +303,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -364,7 +364,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -425,7 +425,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip)
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -547,7 +547,7 @@ define amdgpu_ps <3 x float> @getresinfo_dmask7(<8 x i32> inreg %rsrc, <4 x floa
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <3 x float> @llvm.amdgcn.image.getresinfo.1d.v3f32.i32(i32 7, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -608,7 +608,7 @@ define amdgpu_ps <2 x float> @getresinfo_dmask3(<8 x i32> inreg %rsrc, <4 x floa
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.getresinfo.1d.v2f32.i32(i32 3, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -669,7 +669,7 @@ define amdgpu_ps float @getresinfo_dmask1(<8 x i32> inreg %rsrc, <4 x float> %vd
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_get_resinfo v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index d3434e922752d9..19b0057d69b69d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -74,7 +74,7 @@ define amdgpu_ps half @load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret half %v
@@ -148,7 +148,7 @@ define amdgpu_ps half @load_1d_f16_y(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret half %v
@@ -222,7 +222,7 @@ define amdgpu_ps half @load_1d_f16_z(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret half %v
@@ -296,7 +296,7 @@ define amdgpu_ps half @load_1d_f16_w(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret half %v
@@ -373,7 +373,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xy(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x half> %v
@@ -450,7 +450,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x half> %v
@@ -527,7 +527,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xw(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x half> %v
@@ -604,7 +604,7 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x half> %v
@@ -683,7 +683,7 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x half> %v
@@ -763,7 +763,7 @@ define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x half> %v
@@ -851,7 +851,7 @@ define amdgpu_ps float @load_1d_f16_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -944,7 +944,7 @@ define amdgpu_ps float @load_1d_v2f16_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -1041,7 +1041,7 @@ define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    image_load v[1:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -1133,7 +1133,7 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
index 1cdcfba3d5fc3a..ecf81f633761d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
@@ -60,7 +60,7 @@ define amdgpu_ps float @load_1d_f32_x(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret float %v
@@ -120,7 +120,7 @@ define amdgpu_ps float @load_1d_f32_y(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret float %v
@@ -180,7 +180,7 @@ define amdgpu_ps float @load_1d_f32_z(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret float %v
@@ -240,7 +240,7 @@ define amdgpu_ps float @load_1d_f32_w(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret float %v
@@ -300,7 +300,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_xy(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %v
@@ -360,7 +360,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_xz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %v
@@ -420,7 +420,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_xw(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %v
@@ -480,7 +480,7 @@ define amdgpu_ps <2 x float> @load_1d_v2f32_yz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <2 x float> %v
@@ -540,7 +540,7 @@ define amdgpu_ps <3 x float> @load_1d_v3f32_xyz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <3 x float> @llvm.amdgcn.image.load.1d.v3f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <3 x float> %v
@@ -600,7 +600,7 @@ define amdgpu_ps <4 x float> @load_1d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s)
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -670,7 +670,7 @@ define amdgpu_ps float @load_1d_f32_tfe_dmask_x(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -746,7 +746,7 @@ define amdgpu_ps float @load_1d_v2f32_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    image_load v[1:3], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -825,7 +825,7 @@ define amdgpu_ps float @load_1d_v3f32_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX12-NEXT:    image_load v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -899,7 +899,7 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -973,7 +973,7 @@ define amdgpu_ps float @load_1d_f32_tfe_dmask_0(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
index e1f571d55be23b..fb4c92353cb99d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -149,7 +149,7 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX12-NEXT:    image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v7, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
@@ -258,7 +258,7 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX12-NEXT:    image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v7, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index 4eb8c56382f8c8..2c8b8126aa09a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -56,7 +56,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -176,7 +176,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX12-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX12-NEXT:    image_load v[0:4], [v10, v11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
@@ -300,7 +300,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX12-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX12-NEXT:    image_load v[0:4], [v10, v11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
index c7c216767b93bc..11ad98ac5fd1a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
@@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -155,7 +155,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v13
 ; GFX12-NEXT:    image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v9, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
@@ -270,7 +270,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v13
 ; GFX12-NEXT:    image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v9, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index ff93d1119df83e..f5d11fcdff80a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -50,7 +50,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -158,7 +158,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX12-NEXT:    image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v6, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -270,7 +270,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX12-NEXT:    image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v6, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index 1285bb79e017f0..162a58613065f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
@@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX12-NEXT:    image_load v[0:4], [v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v8, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -262,7 +262,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX12-NEXT:    image_load v[0:4], [v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v8, v4, s[10:11]
 ; GFX12-NEXT:    ; return to shader part epilog
   %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index 2b82d891afaf03..268d416d2020b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -54,7 +54,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -91,7 +91,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v1, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -114,7 +114,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -149,7 +149,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -172,7 +172,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -207,7 +207,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -230,7 +230,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-LABEL: sample_c_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -267,7 +267,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -306,7 +306,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX12-NEXT:    v_lshl_or_b32 v5, v5, 16, v4
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -345,7 +345,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 ; GFX12-NEXT:    v_lshl_or_b32 v5, v5, 16, v4
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index f495ce5390c727..0d25ee527ee80a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -20,7 +20,11 @@ define float @v_rsq_clamp_f32(float %src) #0 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f32_e32 v0, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
 ; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
@@ -47,7 +51,11 @@ define float @v_rsq_clamp_fabs_f32(float %src) #0 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_fabs_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f32_e64 v0, |v0|
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
 ; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
@@ -78,7 +86,11 @@ define double @v_rsq_clamp_f64(double %src) #0 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_f64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
 ; GFX12-NEXT:    s_mov_b32 s0, -1
 ; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
@@ -112,7 +124,11 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_fabs_f64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
 ; GFX12-NEXT:    s_mov_b32 s0, -1
 ; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
@@ -144,7 +160,11 @@ define float @v_rsq_clamp_undef_f32() #0 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_undef_f32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_s_rsq_f32 s0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0xff7fffff
 ; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
@@ -174,7 +194,11 @@ define double @v_rsq_clamp_undef_f64() #0 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_undef_f64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], s[0:1]
 ; GFX12-NEXT:    s_mov_b32 s0, -1
 ; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
@@ -205,7 +229,11 @@ define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_f32_non_ieee:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f32_e32 v0, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
 ; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
@@ -235,7 +263,11 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
 ;
 ; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
 ; GFX12-NEXT:    s_mov_b32 s0, -1
 ; GFX12-NEXT:    s_mov_b32 s1, 0x7fefffff
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 4853bb309c1bb6..13fb5b7b9bfd3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -11,14 +11,22 @@
 define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX12-UNALIGNED:       ; %bb.0:
-; GFX12-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX12-NOUNALIGNED:       ; %bb.0:
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0xb
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v2, v[0:1], off
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v3, v[0:1], off offset:1
@@ -32,23 +40,23 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v11, v[0:1], off offset:9
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v12, v[0:1], off offset:11
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v0, v[0:1], off offset:10
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0xa
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x9
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x5
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v7, v11, 8, v10
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v0, v2, v3, v1
 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v1, v5, v6, v4
@@ -217,14 +225,22 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX12-UNALIGNED:       ; %bb.0:
-; GFX12-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX12-NOUNALIGNED:       ; %bb.0:
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0x5
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v2, v[0:1], off
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v3, v[0:1], off offset:2
@@ -232,11 +248,11 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v5, v[0:1], off offset:6
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v6, v[0:1], off offset:8
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v7, v[0:1], off offset:10
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
 ; GFX12-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -332,9 +348,13 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 ; GFX12-LABEL: v_load_constant_v3i32_align4:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_load_constant_v3i32_align4:
@@ -374,9 +394,13 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 ; GFX12-LABEL: v_load_constant_i96_align8:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_load_constant_i96_align8:
@@ -416,9 +440,13 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 ; GFX12-LABEL: v_load_constant_v3i32_align8:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_load_constant_v3i32_align8:
@@ -458,9 +486,13 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 ; GFX12-LABEL: v_load_constant_v6i16_align8:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_load_constant_v6i16_align8:
@@ -509,9 +541,13 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX12-LABEL: v_load_constant_v12i8_align8:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
@@ -598,9 +634,13 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
 ; GFX12-LABEL: v_load_constant_v3i32_align16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_load_constant_v3i32_align16:
@@ -638,7 +678,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
 ; GFX12-UNALIGNED:       ; %bb.0:
 ; GFX12-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
-; GFX12-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
@@ -660,24 +700,24 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v10, v0, s[0:1] offset:9
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v11, v0, s[0:1] offset:11
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v0, v0, s[0:1] offset:10
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0xa
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x9
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x6
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x5
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v1, v2, v3, v1
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v2, v5, v6, v4
 ; GFX12-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v1
@@ -860,7 +900,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
 ; GFX12-UNALIGNED:       ; %bb.0:
 ; GFX12-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
-; GFX12-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
@@ -876,11 +916,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v4, v0, s[0:1] offset:6
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v5, v0, s[0:1] offset:8
 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v0, v0, s[0:1] offset:10
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
-; GFX12-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v0, 16, v5
 ; GFX12-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX12-NOUNALIGNED-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -990,7 +1030,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
 ; GFX12-LABEL: s_load_constant_v3i32_align4:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_load_constant_v3i32_align4:
@@ -1027,7 +1067,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
 ; GFX12-LABEL: s_load_constant_i96_align8:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_load_constant_i96_align8:
@@ -1064,7 +1104,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
 ; GFX12-LABEL: s_load_constant_v3i32_align8:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_load_constant_v3i32_align8:
@@ -1101,7 +1141,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
 ; GFX12-LABEL: s_load_constant_v6i16_align8:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_load_constant_v6i16_align8:
@@ -1139,7 +1179,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
 ; GFX12-LABEL: s_load_constant_v12i8_align8:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s13, s0, 8
 ; GFX12-NEXT:    s_lshr_b32 s12, s0, 16
 ; GFX12-NEXT:    s_lshr_b32 s3, s0, 24
@@ -1216,7 +1256,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg
 ; GFX12-LABEL: s_load_constant_v3i32_align16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
 ; GCN-LABEL: s_load_constant_v3i32_align16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 904120e7d11894..a9f0e546eb35b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -671,8 +671,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(ptr addrspace(1) inreg %ptr) {
 ; GFX12-LABEL: mubuf_load_sgpr_ptr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %val = load volatile float, ptr addrspace(1) %ptr
   ret float %val
@@ -704,8 +704,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
 ; GFX12-LABEL: mubuf_load_sgpr_ptr_offset4095:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:16380 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:16380 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4095
   %val = load volatile float, ptr addrspace(1) %gep
@@ -747,8 +747,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s3, 4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296
   %val = load volatile float, ptr addrspace(1) %gep
@@ -790,8 +790,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) in
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s3, 4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297
   %val = load volatile float, ptr addrspace(1) %gep
@@ -824,8 +824,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(ptr addrspace(1) inreg %p
 ; GFX12-LABEL: mubuf_load_sgpr_ptr_offset4096:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:16384 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4096
   %val = load volatile float, ptr addrspace(1) %gep
@@ -855,8 +855,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) {
 ;
 ; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4095:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4095
   %val = load volatile float, ptr addrspace(1) %gep
@@ -893,8 +893,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296
   %val = load volatile float, ptr addrspace(1) %gep
@@ -931,8 +931,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297
   %val = load volatile float, ptr addrspace(1) %gep
@@ -962,8 +962,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) {
 ;
 ; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4096:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16384 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4096
   %val = load volatile float, ptr addrspace(1) %gep
@@ -1007,8 +1007,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_co_u32 s0, s2, s0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s3, s1
-; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
   %val = load volatile float, ptr addrspace(1) %gep
@@ -1045,8 +1045,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
   %val = load volatile float, ptr addrspace(1) %gep
@@ -1083,8 +1083,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(ptr addrspace(
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:1024 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
   %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 256
@@ -1122,8 +1122,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(ptr addrspace(
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:1024 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 256
   %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %soffset
@@ -1165,8 +1165,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg %
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset
   %val = load volatile float, ptr addrspace(1) %gep
@@ -1209,8 +1209,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset
   %gep1 = getelementptr float, ptr addrspace(1) %gep0, i64 4095
@@ -1253,8 +1253,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 4095
   %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %voffset
@@ -1294,7 +1294,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
@@ -1345,7 +1345,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
@@ -1386,7 +1386,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
@@ -1434,7 +1434,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
@@ -1486,7 +1486,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
@@ -1530,7 +1530,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
@@ -1583,7 +1583,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
@@ -1625,7 +1625,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
@@ -1673,7 +1673,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
@@ -1726,7 +1726,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 0840f58ecd1a61..12d00f8cfd9c96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -73,7 +73,11 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
 ;
 ; GFX12-LABEL: v_mul_i16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
@@ -152,7 +156,11 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
 ;
 ; GFX12-LABEL: v_mul_i16_zeroext:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -237,7 +245,11 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
 ;
 ; GFX12-LABEL: v_mul_i16_signext:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
@@ -280,7 +292,11 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
 ;
 ; GFX12-LABEL: v_mul_i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i32 %num, %den
@@ -326,7 +342,11 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ;
 ; GFX12-LABEL: v_mul_v2i32:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -480,7 +500,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
 ;
 ; GFX12-LABEL: v_mul_i64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v2
@@ -653,7 +677,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ;
 ; GFX12-LABEL: v_mul_i96:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
 ; GFX12-NEXT:    v_mul_lo_u32 v2, v2, v3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -989,7 +1017,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ;
 ; GFX12-LABEL: v_mul_i128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v3, v3, v4
@@ -2352,7 +2384,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ;
 ; GFX12-LABEL: v_mul_i256:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
 ; GFX12-NEXT:    v_mul_lo_u32 v27, v6, v9
 ; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v8
@@ -2496,7 +2532,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX12-LABEL: s_mul_u64_zext_with_vregs:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v2, v[2:3], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX12-NEXT:    s_nop 0
@@ -2591,10 +2627,10 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_mov_b32 s3, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
@@ -2673,7 +2709,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX12-LABEL: s_mul_u64_sext_with_vregs:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v2, v[2:3], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX12-NEXT:    s_nop 0
@@ -2783,9 +2819,9 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 34a676bffcfe31..78add099fc61b9 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -74,9 +74,9 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-LABEL: s_add_i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, s3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -170,9 +170,9 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-LABEL: s_add_v2i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s4, s6
 ; GFX12-NEXT:    s_add_co_i32 s3, s5, s7
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -287,9 +287,9 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-LABEL: s_add_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_co_i32 s3, s3, s7
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, s6
 ; GFX12-NEXT:    s_add_co_i32 s0, s0, s4
@@ -454,7 +454,7 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b512 s[4:19], s[0:1], 0x44
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s7, s15
 ; GFX12-NEXT:    s_add_co_i32 s3, s6, s14
 ; GFX12-NEXT:    s_add_co_i32 s6, s11, s19
@@ -728,7 +728,7 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
 ; GFX12-NEXT:    s_load_b512 s[4:19], s[0:1], 0x64
 ; GFX12-NEXT:    s_load_b512 s[36:51], s[0:1], 0xa4
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s7, s39
 ; GFX12-NEXT:    s_add_co_i32 s3, s6, s38
 ; GFX12-NEXT:    s_add_co_i32 s6, s11, s43
@@ -859,11 +859,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -956,9 +956,9 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1051,7 +1051,7 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -1157,9 +1157,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
@@ -1308,7 +1308,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-LABEL: add64_in_branch:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GFX12-NEXT:  ; %bb.1: ; %else
@@ -1318,7 +1318,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:  .LBB9_2: ; %if
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
 ; GFX12-NEXT:  .LBB9_3: ; %endif
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index e3d2ecefbda30d..12434fa0375139 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -234,17 +234,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    s_mul_i32 s4, s4, 5
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB0_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -265,17 +265,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    s_mul_i32 s3, s3, 5
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB0_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -512,7 +512,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  ; %bb.1:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    s_mul_i32 s4, s6, s4
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
@@ -520,9 +520,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  .LBB1_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -545,7 +545,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  ; %bb.1:
 ; GFX12W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W32-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s4
@@ -553,9 +553,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  .LBB1_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
@@ -858,17 +858,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:  ; %bb.3:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB2_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -901,16 +901,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:  ; %bb.3:
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB2_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1231,18 +1231,18 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W64-NEXT:    s_load_b32 s5, s[0:1], 0x44
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB3_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1277,17 +1277,17 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W32-NEXT:    s_load_b32 s8, s[0:1], 0x44
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB3_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1369,10 +1369,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1609,18 +1609,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    s_mul_i32 s4, s4, 5
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB5_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1641,18 +1641,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    s_mul_i32 s3, s3, 5
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB5_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1893,7 +1893,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  ; %bb.1:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    s_mul_i32 s4, s6, s4
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
@@ -1901,9 +1901,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  .LBB6_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1926,7 +1926,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  ; %bb.1:
 ; GFX12W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W32-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s4
@@ -1934,9 +1934,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  .LBB6_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -2241,17 +2241,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:  ; %bb.3:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB7_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2284,17 +2284,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:  ; %bb.3:
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB7_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2376,10 +2376,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index b4c8da44337ae5..01e615d64083a0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -235,15 +235,15 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB0_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -270,15 +270,15 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
 ; GFX1232-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB0_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -561,18 +561,18 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:  ; %bb.1:
 ; GFX1264-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1264-NEXT:    s_mov_b32 s15, 0x31016000
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s14, -1
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1264-NEXT:    s_mov_b32 s12, s6
 ; GFX1264-NEXT:    s_mov_b32 s13, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB1_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
@@ -599,18 +599,18 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:  ; %bb.1:
 ; GFX1232-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mul_i32 s2, s0, s2
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
 ; GFX1232-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1232-NEXT:    s_mov_b32 s8, s6
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB1_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
@@ -958,15 +958,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB2_4:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1006,15 +1006,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB2_4:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1274,15 +1274,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB3_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1311,15 +1311,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mul_u64 s[4:5], s[4:5], 5
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
 ; GFX1232-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB3_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1655,7 +1655,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1264-NEXT:  ; %bb.1:
 ; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[8:9]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mul_u64 s[8:9], s[0:1], s[10:11]
 ; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s8
@@ -1664,13 +1664,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s8, s6
 ; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB4_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_mul_lo_u32 v3, s1, v2
 ; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s6, -1
@@ -1698,20 +1698,20 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:  ; %bb.1:
 ; GFX1232-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1232-NEXT:    s_mov_b32 s15, 0x31016000
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mul_u64 s[2:3], s[0:1], s[2:3]
 ; GFX1232-NEXT:    s_mov_b32 s14, -1
 ; GFX1232-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX1232-NEXT:    s_mov_b32 s12, s6
 ; GFX1232-NEXT:    s_mov_b32 s13, s7
 ; GFX1232-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB4_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_mul_lo_u32 v3, s1, v2
 ; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s6, -1
@@ -1817,12 +1817,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    s_mov_b32 s11, s7
 ; GFX12-NEXT:    s_mov_b32 s10, s6
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_mov_b32 s5, s1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
@@ -2098,15 +2098,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB6_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1264-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
@@ -2134,15 +2134,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
 ; GFX1232-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB6_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1232-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
@@ -2430,18 +2430,18 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:  ; %bb.1:
 ; GFX1264-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1264-NEXT:    s_mov_b32 s15, 0x31016000
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s14, -1
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1264-NEXT:    s_mov_b32 s12, s6
 ; GFX1264-NEXT:    s_mov_b32 s13, s7
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB7_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
@@ -2468,18 +2468,18 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:  ; %bb.1:
 ; GFX1232-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mul_i32 s2, s0, s2
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
 ; GFX1232-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1232-NEXT:    s_mov_b32 s8, s6
 ; GFX1232-NEXT:    s_mov_b32 s9, s7
 ; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB7_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
@@ -2827,15 +2827,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB8_4:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2875,15 +2875,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB8_4:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -3194,15 +3194,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
 ; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB9_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
@@ -3234,15 +3234,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mul_u64 s[4:5], s[4:5], 5
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
 ; GFX1232-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
 ; GFX1232-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB9_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
@@ -3594,7 +3594,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1264-NEXT:  ; %bb.1:
 ; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[8:9]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mul_u64 s[8:9], s[0:1], s[10:11]
 ; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s8
@@ -3603,11 +3603,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_mov_b32 s8, s6
 ; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX1264-NEXT:    s_waitcnt vmcnt(0)
+; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB10_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_mul_lo_u32 v5, s1, v2
 ; GFX1264-NEXT:    v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s0, v0
@@ -3640,18 +3640,18 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:  ; %bb.1:
 ; GFX1232-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; GFX1232-NEXT:    s_mov_b32 s15, 0x31016000
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mul_u64 s[2:3], s[0:1], s[2:3]
 ; GFX1232-NEXT:    s_mov_b32 s14, -1
 ; GFX1232-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX1232-NEXT:    s_mov_b32 s12, s6
 ; GFX1232-NEXT:    s_mov_b32 s13, s7
 ; GFX1232-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
-; GFX1232-NEXT:    s_waitcnt vmcnt(0)
+; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB10_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1232-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_mul_lo_u32 v5, s1, v2
 ; GFX1232-NEXT:    v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s0, v0
@@ -3762,12 +3762,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    s_mov_b32 s11, s7
 ; GFX12-NEXT:    s_mov_b32 s10, s6
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_mov_b32 s5, s1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 79f8b3a1d5d84c..57a0b1b3d2b13e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -233,17 +233,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    s_mul_i32 s4, s4, 5
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB0_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -264,17 +264,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    s_mul_i32 s3, s3, 5
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB0_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -511,7 +511,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  ; %bb.1:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    s_mul_i32 s4, s6, s4
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
@@ -519,9 +519,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  .LBB1_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -544,7 +544,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  ; %bb.1:
 ; GFX12W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W32-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s4
@@ -552,9 +552,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  .LBB1_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
@@ -857,17 +857,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:  ; %bb.3:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB2_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -900,16 +900,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:  ; %bb.3:
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB2_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -991,10 +991,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1231,18 +1231,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    s_mul_i32 s4, s4, 5
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB4_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1263,18 +1263,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    s_mul_i32 s3, s3, 5
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s3
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB4_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1515,7 +1515,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  ; %bb.1:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    s_mul_i32 s4, s6, s4
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
@@ -1523,9 +1523,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  .LBB5_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1548,7 +1548,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  ; %bb.1:
 ; GFX12W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W32-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, s4
@@ -1556,9 +1556,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  .LBB5_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1863,17 +1863,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:  ; %bb.3:
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB6_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1906,17 +1906,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:  ; %bb.3:
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB6_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1998,10 +1998,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index edf6fbadf1a60a..f0d2e36c487f1f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -241,17 +241,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    s_mul_i32 s4, s4, 5
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB0_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -272,17 +272,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    s_mul_i32 s3, s3, 5
 ; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB0_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -527,7 +527,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    s_mul_i32 s4, s6, s4
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
@@ -535,9 +535,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  .LBB1_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -560,7 +560,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  ; %bb.1:
 ; GFX12W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W32-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
@@ -568,9 +568,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  .LBB1_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
@@ -881,17 +881,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB2_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -925,16 +925,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB2_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1016,10 +1016,10 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1136,9 +1136,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1150,9 +1150,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1397,18 +1397,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    s_mul_i32 s4, s4, 5
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB5_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1429,18 +1429,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    s_mul_i32 s3, s3, 5
 ; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB5_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1689,7 +1689,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    s_mul_i32 s4, s6, s4
 ; GFX12W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, s4
@@ -1697,9 +1697,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:  .LBB6_2:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1722,7 +1722,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  ; %bb.1:
 ; GFX12W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W32-NEXT:    s_bcnt1_i32_b32 s4, s4
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    s_mul_i32 s4, s2, s4
 ; GFX12W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
@@ -1730,9 +1730,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:  .LBB6_2:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -2045,17 +2045,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:  .LBB7_4:
 ; GFX12W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2089,17 +2089,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:  .LBB7_4:
 ; GFX12W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2181,10 +2181,10 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2301,9 +2301,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX12W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX12W64-NEXT:    s_nop 0
 ; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2315,9 +2315,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
 ; GFX12W32-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX12W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX12W32-NEXT:    s_nop 0
 ; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index e18bdc89e7d421..d0ba606052a604 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -89,18 +89,22 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ;
 ; GFX1200-LABEL: syncscope_system:
 ; GFX1200:       ; %bb.0:
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_expcnt 0x0
+; GFX1200-NEXT:    s_wait_samplecnt 0x0
+; GFX1200-NEXT:    s_wait_bvhcnt 0x0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-NEXT:    flat_load_b32 v3, v[0:1]
 ; GFX1200-NEXT:    s_mov_b32 s0, 0
 ; GFX1200-NEXT:  .LBB0_1: ; %atomicrmw.start
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v2
-; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    s_wait_storecnt 0x0
 ; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1200-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
@@ -204,10 +208,14 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ;
 ; GFX1200-LABEL: syncscope_workgroup_rtn:
 ; GFX1200:       ; %bb.0:
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_expcnt 0x0
+; GFX1200-NEXT:    s_wait_samplecnt 0x0
+; GFX1200-NEXT:    s_wait_bvhcnt 0x0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_wait_storecnt 0x0
 ; GFX1200-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1200-NEXT:    global_inv scope:SCOPE_SE
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
@@ -334,11 +342,14 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ;
 ; GFX1200-LABEL: syncscope_workgroup_nortn:
 ; GFX1200:       ; %bb.0:
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_expcnt 0x0
+; GFX1200-NEXT:    s_wait_samplecnt 0x0
+; GFX1200-NEXT:    s_wait_bvhcnt 0x0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_wait_storecnt 0x0
 ; GFX1200-NEXT:    flat_atomic_add_f32 v[0:1], v2
-; GFX1200-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1200-NEXT:    global_inv scope:SCOPE_SE
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
@@ -422,18 +433,22 @@ define float @no_unsafe(ptr %addr, float %val) {
 ;
 ; GFX1200-LABEL: no_unsafe:
 ; GFX1200:       ; %bb.0:
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT:    s_wait_expcnt 0x0
+; GFX1200-NEXT:    s_wait_samplecnt 0x0
+; GFX1200-NEXT:    s_wait_bvhcnt 0x0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-NEXT:    flat_load_b32 v3, v[0:1]
 ; GFX1200-NEXT:    s_mov_b32 s0, 0
 ; GFX1200-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v2
-; GFX1200-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1200-NEXT:    s_wait_storecnt 0x0
 ; GFX1200-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
-; GFX1200-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1200-NEXT:    global_inv scope:SCOPE_SE
 ; GFX1200-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX1200-NEXT:    s_or_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
index f5846c3d6db737..860bacf19f39dd 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
@@ -34,7 +34,7 @@ define amdgpu_cs void @test_sink_smem_offset_400(ptr addrspace(4) inreg %ptr, i3
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:  .LBB0_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x190
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -100,7 +100,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000(ptr addrspace(4) inreg %ptr, i
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:  .LBB1_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0xfa0
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -154,7 +154,7 @@ define amdgpu_cs void @test_sink_smem_offset_4000000(ptr addrspace(4) inreg %ptr
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:  .LBB2_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x3d0900
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -209,7 +209,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000(ptr addrspace(4) inreg %pt
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x2625a00
 ; GFX12-NEXT:  .LBB3_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -255,7 +255,7 @@ define amdgpu_cs void @test_sink_smem_offset_40000000000(ptr addrspace(4) inreg
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:  .LBB4_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -309,7 +309,7 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:  .LBB5_1: ; %loop
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], -0x190
 ; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index b95231fd8880f5..f2e28cc8e1bcd2 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -67,9 +67,9 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -148,9 +148,9 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -230,9 +230,9 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -322,9 +322,9 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
@@ -416,9 +416,9 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
@@ -517,16 +517,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, 0, v1
 ; GFX12-NEXT:    v_min_num_f32_e32 v2, 1.0, v1
 ; GFX12-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX12-NEXT:    global_store_b32 v[0:1], v1, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -605,9 +605,9 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -687,9 +687,9 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e64 v1, -v1, -v1 clamp
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -770,9 +770,9 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -854,9 +854,9 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -935,9 +935,9 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1017,9 +1017,9 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1104,9 +1104,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v1, 0x80000000, 1.0, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1183,9 +1183,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1262,9 +1262,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1341,9 +1341,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1420,9 +1420,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1499,9 +1499,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1578,9 +1578,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1643,7 +1643,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1702,7 +1702,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1762,7 +1762,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1822,7 +1822,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1881,7 +1881,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1940,7 +1940,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2023,9 +2023,9 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2105,9 +2105,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2192,9 +2192,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2278,9 +2278,9 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2360,9 +2360,9 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2439,9 +2439,9 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2518,9 +2518,9 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2597,9 +2597,9 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2676,9 +2676,9 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2755,9 +2755,9 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -2820,7 +2820,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2880,7 +2880,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2962,9 +2962,9 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3062,9 +3062,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3161,9 +3161,9 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, 2.0
@@ -3262,9 +3262,9 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, 0
@@ -3355,9 +3355,9 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3449,9 +3449,9 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
@@ -3544,9 +3544,9 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3636,9 +3636,9 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3728,9 +3728,9 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3829,9 +3829,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -3929,9 +3929,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -4023,9 +4023,9 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b96 s[4:6], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_f32 s2, s4, s5
 ; GFX12-NEXT:    s_add_f32 s3, s4, s6
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index a8dcb74c7d5f92..4bfaa6e90bdfee 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -51,9 +51,13 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
 ;
 ; GFX12-LABEL: global_load_2xi16_align2:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 2
@@ -125,7 +129,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -193,9 +197,13 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
 ;
 ; GFX12-LABEL: global_load_2xi16_align1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 1
@@ -278,7 +286,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -328,9 +336,13 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
 ;
 ; GFX12-LABEL: global_load_2xi16_align4:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 4
@@ -396,7 +408,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 61f0e5e2a88928..f9694dcd89abfb 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -75,16 +75,24 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
 ;
 ; GFX12-LABEL: private_load_2xi16_align2:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v0, off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX12-FLASTSCR:       ; %bb.0:
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_expcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 2
@@ -163,14 +171,22 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
 ;
 ; GFX12-LABEL: private_store_2xi16_align2:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX12-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FLASTSCR-LABEL: private_store_2xi16_align2:
 ; GFX12-FLASTSCR:       ; %bb.0:
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_expcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX12-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
@@ -255,16 +271,24 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ;
 ; GFX12-LABEL: private_load_2xi16_align1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v0, off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FLASTSCR-LABEL: private_load_2xi16_align1:
 ; GFX12-FLASTSCR:       ; %bb.0:
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_expcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 1
@@ -348,14 +372,22 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
 ;
 ; GFX12-LABEL: private_store_2xi16_align1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX12-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FLASTSCR-LABEL: private_store_2xi16_align1:
 ; GFX12-FLASTSCR:       ; %bb.0:
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_expcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX12-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
@@ -425,16 +457,24 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
 ;
 ; GFX12-LABEL: private_load_2xi16_align4:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v0, off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FLASTSCR-LABEL: private_load_2xi16_align4:
 ; GFX12-FLASTSCR:       ; %bb.0:
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_expcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 4
@@ -510,14 +550,22 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r
 ;
 ; GFX12-LABEL: private_store_2xi16_align4:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX12-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FLASTSCR-LABEL: private_store_2xi16_align4:
 ; GFX12-FLASTSCR:       ; %bb.0:
-; GFX12-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-FLASTSCR-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_expcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FLASTSCR-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX12-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX12-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index 93e8630dc7f560..4d432fbcfef973 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -27,7 +27,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX12-LABEL: test_scratch_load_i8_zext_v:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
@@ -59,7 +59,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX12-LABEL: test_scratch_load_i8_sext_v:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_i8 v0, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
@@ -91,7 +91,7 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o
 ; GFX12-LABEL: test_scratch_load_i16_zext_v:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_u16 v0, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
@@ -123,7 +123,7 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o
 ; GFX12-LABEL: test_scratch_load_i16_sext_v:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_i16 v0, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
@@ -159,7 +159,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -197,7 +197,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -235,7 +235,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, p
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -273,7 +273,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -311,7 +311,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -349,7 +349,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -384,7 +384,7 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspac
 ; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    scratch_store_d16_hi_b8 v2, v0, off offset:4
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -419,7 +419,7 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspa
 ; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -457,7 +457,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
 ; GFX12-LABEL: test_scratch_load_i8_zext_s:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_u8 v2, off, s0 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
@@ -489,7 +489,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
 ; GFX12-LABEL: test_scratch_load_i8_sext_s:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_i8 v2, off, s0 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
@@ -521,7 +521,7 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
 ; GFX12-LABEL: test_scratch_load_i16_zext_s:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_u16 v2, off, s0 offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
@@ -553,7 +553,7 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
 ; GFX12-LABEL: test_scratch_load_i16_sext_s:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    scratch_load_i16 v2, off, s0 offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
@@ -590,7 +590,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_u8 v2, off, s0 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -629,7 +629,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_i8 v2, off, s0 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -668,7 +668,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_b16 v2, off, s0 offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -707,7 +707,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_u8 v2, off, s0 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -746,7 +746,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_i8 v2, off, s0 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -785,7 +785,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_b16 v2, off, s0 offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -820,7 +820,7 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspac
 ; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    scratch_store_d16_hi_b8 off, v0, s0 offset:4
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -855,7 +855,7 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspa
 ; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    scratch_store_d16_hi_b16 off, v0, s0 offset:2
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -896,7 +896,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
@@ -933,7 +933,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    scratch_load_i8 v0, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
@@ -970,7 +970,7 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    scratch_load_u16 v0, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
@@ -1007,7 +1007,7 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    scratch_load_i16 v0, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
   %voffset4 = mul i32 %voffset, 4
@@ -1049,7 +1049,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5)
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1093,7 +1093,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5)
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1137,7 +1137,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
 ; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1181,7 +1181,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5)
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1225,7 +1225,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5)
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1269,7 +1269,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1310,7 +1310,7 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX12-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    scratch_store_d16_hi_b8 v1, v0, off offset:4
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -1351,7 +1351,7 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX12-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    scratch_store_d16_hi_b16 v1, v0, off offset:2
 ; GFX12-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 04eb6dcff4632b..2fd0367f599068 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -94,14 +94,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff1_voff1:
@@ -109,14 +109,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff1 = mul i32 %soff, 1
@@ -220,15 +220,15 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff1_voff2:
@@ -236,15 +236,15 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff1 = mul i32 %soff, 1
@@ -348,15 +348,15 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff1_voff4:
@@ -364,15 +364,15 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff1 = mul i32 %soff, 1
@@ -477,16 +477,16 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff2_voff1:
@@ -494,16 +494,16 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff2 = mul i32 %soff, 2
@@ -611,16 +611,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff2_voff2:
@@ -628,16 +628,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff2 = mul i32 %soff, 2
@@ -745,16 +745,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff2_voff4:
@@ -762,16 +762,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff2 = mul i32 %soff, 2
@@ -876,16 +876,16 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff4_voff1:
@@ -893,16 +893,16 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff4 = mul i32 %soff, 4
@@ -1010,16 +1010,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff4_voff2:
@@ -1027,16 +1027,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff4 = mul i32 %soff, 4
@@ -1142,16 +1142,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: soff4_voff4:
@@ -1159,16 +1159,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
-; GFX12-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 bb:
   %soff4 = mul i32 %soff, 4
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 98379f5e3c68b4..1b9f2a688a1901 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -259,7 +259,11 @@ define void @zero_init_foo() {
 ;
 ; GFX12-LABEL: zero_init_foo:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s1, s0
@@ -344,7 +348,11 @@ define void @zero_init_foo() {
 ;
 ; GFX12-PAL-LABEL: zero_init_foo:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
@@ -422,16 +430,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 15
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
 ; GFX12-NEXT:    s_add_co_i32 s1, s1, 4
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_kernel:
@@ -517,16 +525,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, 4
 ; GFX12-PAL-NEXT:    s_add_co_i32 s1, s1, 4
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -595,10 +603,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
 ; GFX12-NEXT:    s_add_co_i32 s1, s1, 4
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_foo:
@@ -681,10 +689,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, 4
 ; GFX12-PAL-NEXT:    s_add_co_i32 s1, s1, 4
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -743,10 +751,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_kernel:
@@ -816,10 +824,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -876,15 +884,19 @@ define void @store_load_vindex_foo(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_vindex_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_foo:
@@ -943,15 +955,19 @@ define void @store_load_vindex_foo(i32 %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_vindex_foo:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -988,7 +1004,11 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: private_ptr_foo:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -1025,7 +1045,11 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-PAL-LABEL: private_ptr_foo:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
@@ -1097,8 +1121,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ;
 ; GFX12-LABEL: zero_init_small_offset_kernel:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s1, s0
@@ -1228,8 +1252,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ;
 ; GFX12-PAL-LABEL: zero_init_small_offset_kernel:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
@@ -1312,9 +1336,13 @@ define void @zero_init_small_offset_foo() {
 ;
 ; GFX12-LABEL: zero_init_small_offset_foo:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s1, s0
@@ -1407,9 +1435,13 @@ define void @zero_init_small_offset_foo() {
 ;
 ; GFX12-PAL-LABEL: zero_init_small_offset_foo:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
@@ -1495,19 +1527,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 15
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
 ; GFX12-NEXT:    s_addk_co_i32 s1, 0x104
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1629,19 +1661,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x104
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x104
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1714,18 +1746,18 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-LABEL: store_load_sindex_small_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
 ; GFX12-NEXT:    s_addk_co_i32 s1, 0x104
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
@@ -1837,18 +1869,18 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x104
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x104
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1914,13 +1946,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX12-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -2020,13 +2052,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -2095,17 +2127,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_vindex_small_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x100
-; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -2176,17 +2212,21 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-PAL-NEXT:    s_add_co_i32 s0, s32, 0x100
-; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -2267,8 +2307,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ;
 ; GFX12-LABEL: zero_init_large_offset_kernel:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s1, s0
@@ -2403,8 +2443,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ;
 ; GFX12-PAL-LABEL: zero_init_large_offset_kernel:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
@@ -2498,9 +2538,13 @@ define void @zero_init_large_offset_foo() {
 ;
 ; GFX12-LABEL: zero_init_large_offset_foo:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s1, s0
@@ -2634,9 +2678,13 @@ define void @zero_init_large_offset_foo() {
 ;
 ; GFX12-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX12-PAL:       ; %bb.0:
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
@@ -2722,19 +2770,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 15
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
 ; GFX12-NEXT:    s_addk_co_i32 s1, 0x4004
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2856,19 +2904,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x4004
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x4004
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2941,18 +2989,18 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-LABEL: store_load_sindex_large_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
 ; GFX12-NEXT:    s_addk_co_i32 s1, 0x4004
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -3064,18 +3112,18 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x4004
 ; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x4004
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -3142,13 +3190,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -3250,13 +3298,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:16388 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -3326,17 +3374,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_vindex_large_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
-; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3409,17 +3461,21 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ;
 ; GFX12-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-PAL-NEXT:    s_add_co_i32 s0, s32, 0x4000
-; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -3485,12 +3541,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT:    scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3590,12 +3646,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, off offset:16004 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16004 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16004 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -3656,14 +3712,18 @@ define void @store_load_large_imm_offset_foo() {
 ;
 ; GFX12-LABEL: store_load_large_imm_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-NEXT:    scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3727,14 +3787,18 @@ define void @store_load_large_imm_offset_foo() {
 ;
 ; GFX12-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s32 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -3797,14 +3861,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-LABEL: store_load_vidx_sidx_offset:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3879,14 +3943,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
   %alloca = alloca [32 x i32], align 4, addrspace(5)
@@ -3934,12 +3998,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_i64_aligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
-; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i64_aligned:
@@ -3987,12 +4055,16 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-PAL-LABEL: store_load_i64_aligned:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
-; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -4035,12 +4107,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_i64_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
-; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i64_unaligned:
@@ -4088,12 +4164,16 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-PAL-LABEL: store_load_i64_unaligned:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
-; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -4139,13 +4219,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_v3i32_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 3
-; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
@@ -4197,13 +4281,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-PAL-LABEL: store_load_v3i32_unaligned:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, 3
-; GFX12-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -4251,13 +4339,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-LABEL: store_load_v4i32_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
-; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
@@ -4312,13 +4404,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ;
 ; GFX12-PAL-LABEL: store_load_v4i32_unaligned:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
-; GFX12-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
@@ -4360,12 +4456,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ;
 ; GFX12-LABEL: store_load_i32_negative_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
@@ -4423,12 +4523,16 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ;
 ; GFX12-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
@@ -4472,12 +4576,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ;
 ; GFX12-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
@@ -4536,12 +4644,16 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ;
 ; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX12-PAL:       ; %bb.0: ; %bb
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
@@ -4626,10 +4738,10 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v3, v0
-; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ; use v0
@@ -4744,10 +4856,10 @@ define amdgpu_ps void @large_offset() {
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, v0
-; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 th:TH_STORE_NT_RT
-; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 th:TH_LOAD_RT_NT
-; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
 ; GFX12-PAL-NEXT:    ;;#ASMSTART
 ; GFX12-PAL-NEXT:    ; use v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index d7f780e414caed..524632cb2db336 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -37,12 +37,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_add_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -95,11 +94,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -155,15 +154,14 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -219,14 +217,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -269,12 +267,11 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_add_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -322,11 +319,11 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -377,15 +374,14 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -436,14 +432,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_add_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -489,12 +485,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_and_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -547,11 +542,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -607,15 +602,14 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -671,14 +665,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -721,12 +715,11 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_and_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -774,11 +767,11 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -829,15 +822,14 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -888,14 +880,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_and_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -941,12 +933,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_sub_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -999,11 +990,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1059,15 +1050,14 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1123,14 +1113,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1173,12 +1163,11 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_sub_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1226,11 +1215,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1281,15 +1270,14 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1340,14 +1328,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1391,12 +1379,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_max_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1449,11 +1436,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1507,15 +1494,14 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1571,14 +1557,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1619,12 +1605,11 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_max_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1672,11 +1657,11 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1725,15 +1710,14 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1784,14 +1768,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_max_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1835,12 +1819,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umax_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1893,11 +1876,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -1951,15 +1934,14 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2015,14 +1997,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2063,12 +2045,11 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umax_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2116,11 +2097,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2169,15 +2150,14 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2228,14 +2208,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2279,12 +2259,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_min_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2337,11 +2316,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2395,15 +2374,14 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2459,14 +2437,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2507,12 +2485,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_min_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2560,11 +2537,11 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2613,15 +2590,14 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2672,14 +2648,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_min_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2723,12 +2699,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umin_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2781,11 +2756,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2839,15 +2814,14 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2903,14 +2877,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -2951,12 +2925,11 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_umin_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3004,11 +2977,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3057,15 +3030,14 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3116,14 +3088,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3169,12 +3141,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_or_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3227,11 +3198,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3287,15 +3258,14 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3351,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3401,12 +3371,11 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_or_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3454,11 +3423,11 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3509,15 +3478,14 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3568,14 +3536,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GFX12-LABEL: atomic_or_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3621,12 +3589,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xchg_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3669,12 +3636,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GFX12-LABEL: atomic_xchg_f64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3717,12 +3683,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GFX12-LABEL: atomic_xchg_pointer_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3775,11 +3740,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3835,15 +3800,14 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3899,14 +3863,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -3949,12 +3913,11 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xchg_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4002,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4057,15 +4020,14 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4116,14 +4078,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4169,12 +4131,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xor_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4227,11 +4188,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4287,15 +4248,14 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4351,14 +4311,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4401,12 +4361,11 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_xor_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4454,11 +4413,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4509,15 +4468,14 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4568,14 +4526,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4623,11 +4581,11 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
 ; GFX12-LABEL: atomic_load_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4670,11 +4628,11 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
 ; GFX12-LABEL: atomic_load_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4730,14 +4688,14 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4791,14 +4749,14 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4839,7 +4797,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
 ; GFX12-LABEL: atomic_store_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] offset:32
@@ -4876,7 +4834,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
 ; GFX12-LABEL: atomic_store_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -4926,7 +4884,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
@@ -4977,7 +4935,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
@@ -5033,13 +4991,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5090,13 +5047,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5149,12 +5105,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -5211,15 +5167,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5283,14 +5238,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -5342,13 +5297,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5396,12 +5350,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -5453,15 +5407,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5520,14 +5473,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[2:3], s[10:11], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
 ; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -5576,11 +5529,11 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
 ; GFX12-LABEL: atomic_load_f64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5623,11 +5576,11 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
 ; GFX12-LABEL: atomic_load_f64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5683,14 +5636,14 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5744,14 +5697,14 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5792,7 +5745,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
 ; GFX12-LABEL: atomic_store_f64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] offset:32
@@ -5829,7 +5782,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
 ; GFX12-LABEL: atomic_store_f64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -5879,7 +5832,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
@@ -5930,7 +5883,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
@@ -5978,12 +5931,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_inc_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6036,11 +5988,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6096,15 +6048,14 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6160,14 +6111,14 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6210,12 +6161,11 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_inc_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6263,11 +6213,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6318,15 +6268,14 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6377,14 +6326,14 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_inc_i64_ret_incr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6430,12 +6379,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_dec_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6488,11 +6436,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6548,15 +6496,14 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6612,14 +6559,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6662,12 +6609,11 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GFX12-LABEL: atomic_dec_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6715,11 +6661,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -6770,15 +6716,14 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1]
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6829,14 +6774,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_dec_i64_ret_decr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 4fe2a4ad2a2fba..4c716444604618 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -253,11 +253,11 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_b32 v1, v0, s[6:7] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_b32 v2, v0, s[0:1] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_maximum_f32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GCN-NEXT:    s_nop 0
@@ -277,11 +277,11 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_u16 v1, v0, s[6:7] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_u16 v2, v0, s[0:1] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_maximum_f16 v1, v1, v2
 ; GCN-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GCN-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b63a4fa40b5913..5d1b8b7ad0512f 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -253,11 +253,11 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_b32 v1, v0, s[6:7] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_b32 v2, v0, s[0:1] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_minimum_f32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GCN-NEXT:    s_nop 0
@@ -277,11 +277,11 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_u16 v1, v0, s[6:7] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_u16 v2, v0, s[0:1] th:TH_LOAD_RT_NT
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_minimum_f16 v1, v1, v2
 ; GCN-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GCN-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
index 5c3e83c4b9cfe1..48e8d87129c590 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll
@@ -27,9 +27,9 @@ define amdgpu_cs float @flat_atomic_fmin_num_f32_rtn(ptr %ptr, float %data, ptr
 ; GFX12-LABEL: flat_atomic_fmin_num_f32_rtn:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[3:4], v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data)
   store float %ret, ptr %out
@@ -40,9 +40,9 @@ define amdgpu_cs float @flat_atomic_fmax_num_f32_rtn(ptr %ptr, float %data, ptr
 ; GFX12-LABEL: flat_atomic_fmax_num_f32_rtn:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[3:4], v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data)
   store float %ret, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
index cf069ee92639f8..69c2c60e4ae2f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
@@ -31,7 +31,7 @@ define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, flo
 ; GFX12-LABEL: global_atomic_fmax_num_f32_rtn:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -45,7 +45,7 @@ define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, flo
 ; GFX12-LABEL: global_atomic_fmin_num_f32_rtn:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index b2b3f3e1bfbd96..2cbc346836212e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -31,7 +31,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sb
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = load i8, ptr addrspace(1) %sbase
   %zext = zext i8 %load to i32
@@ -66,7 +66,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
   %load = load i8, ptr addrspace(1) %gep0
@@ -95,7 +95,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096
   %load = load i8, ptr addrspace(1) %gep0
@@ -124,7 +124,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4097
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097
   %load = load i8, ptr addrspace(1) %gep0
@@ -161,7 +161,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
   %load = load i8, ptr addrspace(1) %gep0
@@ -202,7 +202,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4097
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
   %load = load i8, ptr addrspace(1) %gep0
@@ -243,7 +243,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4098
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
   %load = load i8, ptr addrspace(1) %gep0
@@ -279,7 +279,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048
   %load = load i8, ptr addrspace(1) %gep0
@@ -315,7 +315,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2049
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049
   %load = load i8, ptr addrspace(1) %gep0
@@ -351,7 +351,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2050
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050
   %load = load i8, ptr addrspace(1) %gep0
@@ -380,7 +380,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
   %load = load i8, ptr addrspace(1) %gep0
@@ -417,7 +417,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
   %load = load i8, ptr addrspace(1) %gep0
@@ -454,7 +454,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2050
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
   %load = load i8, ptr addrspace(1) %gep0
@@ -489,7 +489,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x7FFFFF(ptr addrspace(1) in
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 8388607
   %load = load i8, ptr addrspace(1) %gep0
@@ -529,7 +529,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) in
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-8388608
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608
   %load = load i8, ptr addrspace(1) %gep0
@@ -564,7 +564,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0xff800000
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295
   %load = load i8, ptr addrspace(1) %gep0
@@ -606,7 +606,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1)
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000000:
@@ -616,7 +616,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296
   %load = load i8, ptr addrspace(1) %gep0
@@ -658,7 +658,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000001:
@@ -668,7 +668,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297
   %load = load i8, ptr addrspace(1) %gep0
@@ -710,7 +710,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1)
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000FFF:
@@ -720,7 +720,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391
   %load = load i8, ptr addrspace(1) %gep0
@@ -762,7 +762,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1)
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100001000:
@@ -772,7 +772,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392
   %load = load i8, ptr addrspace(1) %gep0
@@ -815,7 +815,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388607
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF:
@@ -825,7 +825,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295
   %load = load i8, ptr addrspace(1) %gep0
@@ -867,7 +867,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000000:
@@ -877,7 +877,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296
   %load = load i8, ptr addrspace(1) %gep0
@@ -919,7 +919,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000001:
@@ -929,7 +929,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297
   %load = load i8, ptr addrspace(1) %gep0
@@ -959,7 +959,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -996,7 +996,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1045,7 +1045,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1083,7 +1083,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrsp
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1132,7 +1132,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrsp
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4097
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1160,7 +1160,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2047
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1198,7 +1198,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1226,7 +1226,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1264,7 +1264,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrsp
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1313,7 +1313,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrs
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 8388607
@@ -1361,7 +1361,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrs
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-8388608
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -8388608
@@ -1399,7 +1399,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
@@ -1427,7 +1427,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1456,7 +1456,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1485,7 +1485,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1515,7 +1515,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1574,22 +1574,22 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-NEXT:    ds_load_b64 v[1:2], v1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    ds_load_b64 v[1:2], v1
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v1, v0
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
@@ -1640,22 +1640,22 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-NEXT:    ds_load_b64 v[1:2], v1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] offset:42
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    ds_load_b64 v[1:2], v1
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v1, v0
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:42
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
@@ -1687,7 +1687,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1717,7 +1717,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr a
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-24
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1748,7 +1748,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1780,7 +1780,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
@@ -1824,7 +1824,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, v0, s2
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32:
@@ -1836,7 +1836,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v0, v2
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
@@ -1879,7 +1879,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, v0, s2
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
@@ -1891,7 +1891,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v0, v2
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
@@ -1951,14 +1951,14 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, s2, v0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
 ; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing:
@@ -1967,14 +1967,14 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v2, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX12-GISEL-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -2004,9 +2004,9 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr a
 ; GFX12-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -2039,10 +2039,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) i
 ; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
   %zext.offset = zext i32 %voffset to i64
@@ -2074,10 +2074,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr add
 ; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:400
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
   %zext.offset = zext i32 %voffset to i64
@@ -2132,14 +2132,14 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc, s2, v0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
 ; GFX12-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
@@ -2148,14 +2148,14 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc, v2, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX12-GISEL-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{}
   %zext.offset = zext i32 %voffset to i64
@@ -2184,7 +2184,7 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32
 ; GFX12-LABEL: global_load_saddr_i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2209,7 +2209,7 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s
 ; GFX12-LABEL: global_load_saddr_i16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2235,7 +2235,7 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32
 ; GFX12-LABEL: global_load_saddr_f16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2259,7 +2259,7 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s
 ; GFX12-LABEL: global_load_saddr_f16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2284,7 +2284,7 @@ define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32
 ; GFX12-LABEL: global_load_saddr_i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2309,7 +2309,7 @@ define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %
 ; GFX12-LABEL: global_load_saddr_i32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2335,7 +2335,7 @@ define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32
 ; GFX12-LABEL: global_load_saddr_f32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2359,7 +2359,7 @@ define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %
 ; GFX12-LABEL: global_load_saddr_f32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2384,7 +2384,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: global_load_saddr_v2i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2409,7 +2409,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v2i16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2435,7 +2435,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: global_load_saddr_v2f16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2459,7 +2459,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v2f16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2484,7 +2484,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase,
 ; GFX12-LABEL: global_load_saddr_p3:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2510,7 +2510,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inr
 ; GFX12-LABEL: global_load_saddr_p3_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2537,7 +2537,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbas
 ; GFX12-LABEL: global_load_saddr_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2562,7 +2562,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) i
 ; GFX12-LABEL: global_load_saddr_f64_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2588,7 +2588,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbas
 ; GFX12-LABEL: global_load_saddr_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2613,7 +2613,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) i
 ; GFX12-LABEL: global_load_saddr_i64_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2639,7 +2639,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v2f32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2663,7 +2663,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v2f32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2688,7 +2688,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v2i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2713,7 +2713,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v2i32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2739,7 +2739,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v4i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2764,7 +2764,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v4i16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2790,7 +2790,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v4f16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2815,7 +2815,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v4f16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2841,7 +2841,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase
 ; GFX12-LABEL: global_load_saddr_p1:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2867,7 +2867,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) in
 ; GFX12-LABEL: global_load_saddr_p1_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2894,7 +2894,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v3f32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2918,7 +2918,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v3f32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2943,7 +2943,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v3i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2968,7 +2968,7 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v3i32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2994,7 +2994,7 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: global_load_saddr_v6f16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3018,7 +3018,7 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v6f16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3043,7 +3043,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v4f32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3067,7 +3067,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v4f32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3092,7 +3092,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3117,7 +3117,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v4i32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3143,7 +3143,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sb
 ; GFX12-LABEL: global_load_saddr_v2i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3168,7 +3168,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v2i64_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3194,7 +3194,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: global_load_saddr_i128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3219,7 +3219,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_i128_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3245,7 +3245,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: global_load_saddr_v2p1:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3271,7 +3271,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v2p1_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3298,7 +3298,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: global_load_saddr_v4p3:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3324,7 +3324,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: global_load_saddr_v4p3_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3355,7 +3355,7 @@ define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase,
 ; GFX12-LABEL: global_sextload_saddr_i8:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_i8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3381,7 +3381,7 @@ define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inre
 ; GFX12-LABEL: global_sextload_saddr_i8_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3408,7 +3408,7 @@ define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase,
 ; GFX12-LABEL: global_sextload_saddr_i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_i16 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3434,7 +3434,7 @@ define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inr
 ; GFX12-LABEL: global_sextload_saddr_i16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_i16 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3461,7 +3461,7 @@ define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase,
 ; GFX12-LABEL: global_zextload_saddr_i8:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3487,7 +3487,7 @@ define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inre
 ; GFX12-LABEL: global_zextload_saddr_i8_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3514,7 +3514,7 @@ define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase,
 ; GFX12-LABEL: global_zextload_saddr_i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3540,7 +3540,7 @@ define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inr
 ; GFX12-LABEL: global_zextload_saddr_i16_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3582,7 +3582,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sba
 ; GFX12-LABEL: atomic_global_load_saddr_i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3619,7 +3619,7 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1)
 ; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3657,7 +3657,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inre
 ; GFX12-LABEL: atomic_global_load_saddr_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3694,7 +3694,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspa
 ; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3725,13 +3725,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3757,13 +3757,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3795,14 +3795,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3834,14 +3834,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3871,14 +3871,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1)
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3907,14 +3907,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr ad
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3944,14 +3944,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrs
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_u8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -3981,14 +3981,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_u8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4019,14 +4019,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_i8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
@@ -4058,14 +4058,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_i8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
@@ -4100,13 +4100,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4133,13 +4133,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4172,14 +4172,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4211,14 +4211,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4248,14 +4248,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1)
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
@@ -4286,14 +4286,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr ad
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
@@ -4325,14 +4325,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrs
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
@@ -4364,14 +4364,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
@@ -4404,14 +4404,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -4444,14 +4444,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
 ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -4494,7 +4494,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr add
 ; GFX12-NEXT:    v_or_b32_e32 v0, 16, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.idx = zext i32 %idx to i64
   %or = or i64 %zext.idx, 16
@@ -4527,7 +4527,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
 ; GFX12-NEXT:    v_or_b32_e32 v0, 0x1040, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.idx = zext i32 %idx to i64
   %or = or i64 %zext.idx, 4160
@@ -4605,8 +4605,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[4:5], s[2:3], s[0:1]
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_cmp_eq_u32 s0, 0x400
 ; GFX12-SDAG-NEXT:    s_cbranch_scc0 .LBB132_1
 ; GFX12-SDAG-NEXT:  ; %bb.2: ; %bb2
@@ -4626,8 +4626,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v2, vcc, v2, 4
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
-; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0x400, v2
 ; GFX12-GISEL-NEXT:    s_cbranch_vccz .LBB132_1
 ; GFX12-GISEL-NEXT:  ; %bb.2: ; %bb2
@@ -4721,10 +4721,10 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[4:5], s[2:3], s[0:1]
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT:    global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_cmp_eq_u32 s0, 0x400
 ; GFX12-SDAG-NEXT:    s_cbranch_scc0 .LBB133_1
 ; GFX12-SDAG-NEXT:  ; %bb.2: ; %bb2
@@ -4744,10 +4744,10 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v2, vcc, v2, 4
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
-; GFX12-GISEL-NEXT:    global_load_b32 v6, v[4:5], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_b32 v6, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0x400, v2
 ; GFX12-GISEL-NEXT:    s_cbranch_vccz .LBB133_1
 ; GFX12-GISEL-NEXT:  ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
index 2a4a3ad2e3bcaa..1102f9d0f1a5fd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
@@ -26,7 +26,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -59,7 +59,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -93,7 +93,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -151,7 +151,7 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    ds_load_b64 v[2:3], v2
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
@@ -204,7 +204,7 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    ds_load_b64 v[2:3], v2
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 325dae172d5236..0413e21b9c215a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_add_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -117,11 +117,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -193,13 +193,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -273,14 +273,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -340,11 +340,11 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_add_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -410,11 +410,11 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -483,13 +483,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_add_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -560,14 +560,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_add_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -622,11 +622,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_and_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -693,11 +693,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -769,13 +769,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -849,14 +849,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -916,11 +916,11 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_and_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -986,11 +986,11 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1059,13 +1059,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_and_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1136,14 +1136,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_and_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1198,11 +1198,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_sub_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1269,11 +1269,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1345,13 +1345,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1425,14 +1425,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1492,11 +1492,11 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_sub_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1562,11 +1562,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1635,13 +1635,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_sub_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1712,14 +1712,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1768,11 +1768,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_max_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1836,11 +1836,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -1906,13 +1906,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -1983,14 +1983,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2044,11 +2044,11 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_max_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2111,11 +2111,11 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2178,13 +2178,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_i64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2252,14 +2252,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_max_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2308,11 +2308,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX12-LABEL: atomic_umax_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2376,11 +2376,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2446,13 +2446,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2523,14 +2523,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2584,11 +2584,11 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_umax_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2651,11 +2651,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2718,13 +2718,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_max_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2792,14 +2792,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2848,11 +2848,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_min_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -2916,11 +2916,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -2986,13 +2986,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3063,14 +3063,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3124,11 +3124,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_min_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3191,11 +3191,11 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3258,13 +3258,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_i64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3332,14 +3332,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_min_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3388,11 +3388,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX12-LABEL: atomic_umin_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3456,11 +3456,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3526,13 +3526,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3603,14 +3603,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3664,11 +3664,11 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_umin_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3731,11 +3731,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3798,13 +3798,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_min_u64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -3872,14 +3872,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -3934,11 +3934,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_or_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4005,11 +4005,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -4081,13 +4081,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4161,14 +4161,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -4228,11 +4228,11 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_or_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4298,11 +4298,11 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -4371,13 +4371,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_or_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4448,14 +4448,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX12-LABEL: atomic_or_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -4510,11 +4510,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX12-LABEL: atomic_xchg_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4565,11 +4565,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
 ; GFX12-LABEL: atomic_xchg_f64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4620,11 +4620,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_xchg_pointer_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4691,11 +4691,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -4767,13 +4767,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4847,14 +4847,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -4914,11 +4914,11 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_xchg_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -4984,11 +4984,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -5057,13 +5057,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_swap_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5134,14 +5134,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -5196,11 +5196,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_xor_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5267,11 +5267,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -5343,13 +5343,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5423,14 +5423,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -5490,11 +5490,11 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX12-LABEL: atomic_xor_i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5560,11 +5560,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -5633,13 +5633,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_xor_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5710,14 +5710,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -5787,12 +5787,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5860,12 +5860,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -5933,12 +5933,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6012,7 +6012,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s6
@@ -6020,7 +6020,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6105,7 +6105,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
@@ -6113,7 +6113,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
 ; GFX12-NEXT:    s_nop 0
@@ -6185,12 +6185,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[4:5]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6257,12 +6257,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6333,7 +6333,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s6
@@ -6341,7 +6341,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -6423,7 +6423,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x44
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
@@ -6431,7 +6431,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
 ; GFX12-NEXT:    s_nop 0
@@ -6495,9 +6495,9 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6562,9 +6562,9 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6625,9 +6625,9 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6703,12 +6703,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6784,12 +6784,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6866,12 +6866,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -6926,7 +6926,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3] offset:32
 ; GFX12-NEXT:    s_nop 0
@@ -6977,7 +6977,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -7042,7 +7042,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -7110,7 +7110,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -7179,7 +7179,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -7237,11 +7237,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_inc_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -7308,11 +7308,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -7384,13 +7384,13 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -7442,11 +7442,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX12-LABEL: atomic_dec_i64_offset:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -7513,11 +7513,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX12-NEXT:    s_nop 0
@@ -7589,13 +7589,13 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], s[0:1]
 ; GFX12-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
-; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index 6a6c5b33e0dd8f..762722ec3d9230 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -8,7 +8,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1)
 ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-SDAG-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
@@ -18,7 +18,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1)
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -34,10 +34,10 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
 ; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
 ; GFX12-SDAG-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -49,10 +49,10 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
 ; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX12-GISEL-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
index c76f22d7763923..e96d6875558985 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -20,7 +20,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX12-LABEL: load_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX12-LABEL: load_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -69,7 +69,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX12-LABEL: load_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -95,7 +95,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
 ; GFX12-LABEL: load_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -121,7 +121,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX12-LABEL: load_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -146,7 +146,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX12-LABEL: load_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -172,7 +172,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -198,7 +198,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: load_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -225,7 +225,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_mip_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -250,7 +250,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_mip_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -276,7 +276,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_mip_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -303,7 +303,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: load_mip_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -330,7 +330,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: load_mip_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -356,7 +356,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: load_mip_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -830,7 +830,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: getresinfo_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -854,7 +854,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: getresinfo_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -878,7 +878,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: getresinfo_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -902,7 +902,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
 ; GFX12-LABEL: getresinfo_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -926,7 +926,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX12-LABEL: getresinfo_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -950,7 +950,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX12-LABEL: getresinfo_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -974,7 +974,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: getresinfo_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -998,7 +998,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
 ; GFX12-LABEL: getresinfo_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1022,7 +1022,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 ; GFX12-LABEL: load_1d_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1046,7 +1046,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
 ; GFX12-LABEL: load_1d_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1130,7 +1130,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_1d_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1154,7 +1154,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1178,7 +1178,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
 ; GFX12-LABEL: load_1d_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
index 493653b6583089..276b506fb4a230 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
@@ -26,7 +26,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX12-LABEL: load_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -56,7 +56,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX12-LABEL: load_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -87,7 +87,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX12-LABEL: load_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -119,7 +119,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
 ; GFX12-LABEL: load_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX12-LABEL: load_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -182,7 +182,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
 ; GFX12-LABEL: load_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -214,7 +214,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 ; encoding: [0x46,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -246,7 +246,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: load_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 ; encoding: [0x47,0x00,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -279,7 +279,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_mip_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -310,7 +310,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_mip_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -342,7 +342,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_mip_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -375,7 +375,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: load_mip_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -408,7 +408,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: load_mip_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -440,7 +440,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: load_mip_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0x40,0xc0,0xd3,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -920,7 +920,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: getresinfo_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -950,7 +950,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: getresinfo_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -980,7 +980,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX12-LABEL: getresinfo_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1010,7 +1010,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
 ; GFX12-LABEL: getresinfo_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1040,7 +1040,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX12-LABEL: getresinfo_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1070,7 +1070,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX12-LABEL: getresinfo_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1100,7 +1100,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX12-LABEL: getresinfo_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 ; encoding: [0x46,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1130,7 +1130,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
 ; GFX12-LABEL: getresinfo_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 ; encoding: [0x47,0xc0,0xc5,0xd3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %mip = extractelement <2 x i16> %coords, i32 0
@@ -1160,7 +1160,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 ; GFX12-LABEL: load_1d_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x00,0x00,0xd2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1190,7 +1190,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
 ; GFX12-LABEL: load_1d_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x00,0x40,0xd2,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1280,7 +1280,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_1d_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x10,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1310,7 +1310,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
 ; GFX12-LABEL: load_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1340,7 +1340,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
 ; GFX12-LABEL: load_1d_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU a16 ; encoding: [0x40,0x00,0xc0,0xd3,0x00,0x00,0x30,0x00,0x00,0x00,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index afa9696b17110b..5b5586f86f0d73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -41,7 +41,7 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-LABEL: load_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -152,7 +152,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX12-NEXT:    image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v6, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -262,7 +262,7 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -307,7 +307,7 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 ; GFX12-LABEL: load_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -423,7 +423,7 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX12-NEXT:    image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v7, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -468,7 +468,7 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32
 ; GFX12-LABEL: load_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -589,7 +589,7 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX12-NEXT:    image_load v[0:4], [v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v8, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -634,7 +634,7 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i
 ; GFX12-LABEL: load_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -749,7 +749,7 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:4], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -794,7 +794,7 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %s
 ; GFX12-LABEL: load_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -910,7 +910,7 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v11
 ; GFX12-NEXT:    image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v7, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -955,7 +955,7 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t
 ; GFX12-LABEL: load_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1070,7 +1070,7 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:4], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1115,7 +1115,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-LABEL: load_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1236,7 +1236,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX12-NEXT:    image_load v[0:4], [v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v8, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1281,7 +1281,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
 ; GFX12-LABEL: load_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1407,7 +1407,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v13
 ; GFX12-NEXT:    image_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v9, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1452,7 +1452,7 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mi
 ; GFX12-LABEL: load_mip_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1562,7 +1562,7 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:4], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1607,7 +1607,7 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-LABEL: load_mip_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -1728,7 +1728,7 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX12-NEXT:    image_load_mip v[0:4], [v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v8, v4, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1789,7 +1789,7 @@ define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1849,7 +1849,7 @@ define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1909,7 +1909,7 @@ define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i3
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX12-NEXT:    image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1969,7 +1969,7 @@ define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX12-NEXT:    image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2029,7 +2029,7 @@ define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX12-NEXT:    image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2089,7 +2089,7 @@ define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX12-NEXT:    image_load_mip v[3:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2194,7 +2194,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX12-NEXT:    image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v5, v3, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2292,7 +2292,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX12-NEXT:    image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v4, v2, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2381,7 +2381,7 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
 ; GFX12-NEXT:    image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v3, v1, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2470,7 +2470,7 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
 ; GFX12-NEXT:    image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v3, v1, s[8:9]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2516,7 +2516,7 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-LABEL: load_mip_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2557,7 +2557,7 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %
 ; GFX12-LABEL: load_mip_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2598,7 +2598,7 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i3
 ; GFX12-LABEL: load_mip_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2639,7 +2639,7 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i3
 ; GFX12-LABEL: load_mip_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load_mip v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3296,7 +3296,7 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-LABEL: getresinfo_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3337,7 +3337,7 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-LABEL: getresinfo_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3378,7 +3378,7 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-LABEL: getresinfo_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3419,7 +3419,7 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) {
 ; GFX12-LABEL: getresinfo_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3460,7 +3460,7 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip
 ; GFX12-LABEL: getresinfo_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3501,7 +3501,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip
 ; GFX12-LABEL: getresinfo_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3542,7 +3542,7 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip)
 ; GFX12-LABEL: getresinfo_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3583,7 +3583,7 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32
 ; GFX12-LABEL: getresinfo_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3624,7 +3624,7 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-LABEL: load_1d_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3665,7 +3665,7 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-LABEL: load_1d_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3794,7 +3794,7 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-LABEL: load_1d_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
@@ -3835,7 +3835,7 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-LABEL: load_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -3876,7 +3876,7 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX12-LABEL: load_1d_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
@@ -4049,7 +4049,7 @@ define amdgpu_ps <3 x float> @getresinfo_dmask7(<8 x i32> inreg %rsrc, <4 x floa
 ; GFX12-LABEL: getresinfo_dmask7:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <3 x float> @llvm.amdgcn.image.getresinfo.1d.v3f32.i32(i32 7, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -4090,7 +4090,7 @@ define amdgpu_ps <2 x float> @getresinfo_dmask3(<8 x i32> inreg %rsrc, <4 x floa
 ; GFX12-LABEL: getresinfo_dmask3:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.getresinfo.1d.v2f32.i32(i32 3, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -4131,7 +4131,7 @@ define amdgpu_ps float @getresinfo_dmask1(<8 x i32> inreg %rsrc, <4 x float> %vd
 ; GFX12-LABEL: getresinfo_dmask1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_get_resinfo v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -4224,7 +4224,7 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4306,9 +4306,9 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %
 ; GFX12-NEXT:    image_load v1, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    ds_store_2addr_b32 v0, v2, v2 offset1:4
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   store float 0.000000e+00, ptr addrspace(3) %lds
   %c0 = extractelement <2 x i32> %c, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
index bee9cafffa2f8c..3210565c96abb0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -33,7 +33,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -69,7 +69,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -105,7 +105,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -141,7 +141,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -177,7 +177,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -215,7 +215,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -251,7 +251,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -287,7 +287,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -325,7 +325,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -364,7 +364,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -391,7 +391,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -420,7 +420,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -447,7 +447,7 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -474,7 +474,7 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
index d0ee235bcf7b34..3a0762f1c4f77b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
@@ -1,14 +1,15 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10,PRE-GFX12 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10,PRE-GFX12 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX12 %s
 
 ; GCN-LABEL: {{^}}getlod_1d:
 ; PRE-GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}}
 ; GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GCN: s_waitcnt vmcnt(0)
+; PRE-GFX12: s_waitcnt vmcnt(0)
+; GFX12: s_wait_samplecnt 0x0
 define amdgpu_ps <4 x float> @getlod_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -19,7 +20,8 @@ main_body:
 ; PRE-GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3{{$}}
 ; GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_2D
 ; GFX12: image_get_lod v[0:1], [v0, v1], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_2D
-; GCN: s_waitcnt vmcnt(0)
+; PRE-GFX12: s_waitcnt vmcnt(0)
+; GFX12: s_wait_samplecnt 0x0
 define amdgpu_ps <2 x float> @getlod_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.getlod.2d.v2f32.f32(i32 3, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index 863dd357de43d4..9b5ae45688c210 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -12,7 +12,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-LABEL: load_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -32,7 +32,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x00,0x01,0x02,0x00]
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -53,7 +53,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
 ; GFX12-LABEL: load_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x07,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -73,7 +73,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:4], [v0, v1, v2, v3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -94,7 +94,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-LABEL: load_2dmsaa_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_NT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x10,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 1)
@@ -111,7 +111,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_slc(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-LABEL: load_2dmsaa_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_HT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x20,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 2)
@@ -128,7 +128,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc_slc(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-LABEL: load_2dmsaa_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_LU ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x30,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 3)
@@ -145,7 +145,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-LABEL: load_2dmsaa_d16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:1], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; encoding: [0x26,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -165,7 +165,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addr
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:2], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -186,7 +186,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_d16(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-LABEL: load_2darraymsaa_d16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:1], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; encoding: [0x27,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -206,7 +206,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:2], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -229,7 +229,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -250,7 +250,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index b0c2f7ac818451..4dfa3aa3aa7c53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -38,7 +38,7 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -84,7 +84,7 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -130,7 +130,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -176,7 +176,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -222,7 +222,7 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -268,7 +268,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -309,7 +309,7 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -355,7 +355,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -401,7 +401,7 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -447,7 +447,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -493,7 +493,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -541,7 +541,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -582,7 +582,7 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -669,7 +669,7 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -715,7 +715,7 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -761,7 +761,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -809,7 +809,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -855,7 +855,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -904,7 +904,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -933,7 +933,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -975,7 +975,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1025,7 +1025,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v9, v7, v6, 0x5040100
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[8:10]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1054,7 +1054,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1098,7 +1098,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1132,7 +1132,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1174,7 +1174,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1208,7 +1208,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1252,7 +1252,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v6, v6, v5, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1286,7 +1286,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1320,7 +1320,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1354,7 +1354,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_sample_c_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1390,7 +1390,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1419,7 +1419,7 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-LABEL: sample_lz_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1453,7 +1453,7 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1482,7 +1482,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_c_lz_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1516,7 +1516,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1564,7 +1564,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v6, v5, v4, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1612,7 +1612,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT:    v_perm_b32 v6, v5, v4, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 950db78d2d502d..d90c193514eb0e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -49,7 +49,7 @@ define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v0, [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -144,7 +144,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_sample v[0:1], [v3, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v4, v1, s[12:13]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -185,7 +185,7 @@ define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX12-LABEL: image_sample_c_d_1d_v2f16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d v0, [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -251,7 +251,7 @@ define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
@@ -310,7 +310,7 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:1], [v0, v1, v2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -404,7 +404,7 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:2], [v5, v4, v3], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
@@ -467,7 +467,7 @@ define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:1], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -561,7 +561,7 @@ define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:2], [v5, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 86bf6f03bcfd1c..d0a1597c856c8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -39,7 +39,7 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -139,7 +139,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v6, v4, s[12:13]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -196,7 +196,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -255,7 +255,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -314,7 +314,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -373,7 +373,7 @@ define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -446,7 +446,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -521,7 +521,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -589,7 +589,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg
 ; GFX12-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -699,7 +699,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
 ; GFX12-NEXT:    image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v6, v4, s[12:13]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -744,7 +744,7 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -785,7 +785,7 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -826,7 +826,7 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -867,7 +867,7 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -908,7 +908,7 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -949,7 +949,7 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -990,7 +990,7 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1031,7 +1031,7 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1072,7 +1072,7 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1113,7 +1113,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1154,7 +1154,7 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1195,7 +1195,7 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1236,7 +1236,7 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1277,7 +1277,7 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1318,7 +1318,7 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1359,7 +1359,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1400,7 +1400,7 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1441,7 +1441,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1482,7 +1482,7 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1511,7 +1511,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1540,7 +1540,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_d_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d v[0:3], [v0, v1, v2, v[3:5]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1569,7 +1569,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1598,7 +1598,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_d_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d v[0:3], [v0, v1, v2, v[3:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1627,7 +1627,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_cl v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1656,7 +1656,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_d_cl_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_cl v[0:3], [v0, v1, v2, v[3:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1685,7 +1685,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-LABEL: sample_c_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_cl v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1714,7 +1714,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-LABEL: sample_c_d_cl_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_cl v[0:3], [v0, v1, v2, v[3:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1743,7 +1743,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_l_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1772,7 +1772,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_l_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1801,7 +1801,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_l_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_l v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1830,7 +1830,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_l_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_l v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1859,7 +1859,7 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-LABEL: sample_lz_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1888,7 +1888,7 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX12-LABEL: sample_lz_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1917,7 +1917,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_c_lz_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1946,7 +1946,7 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_c_lz_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_lz v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -1975,7 +1975,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-LABEL: sample_c_d_o_2darray_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_o v0, [v0, v1, v2, v[3:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2039,7 +2039,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
 ; GFX12-NEXT:    v_mov_b32_e32 v12, v11
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
 ; GFX12-NEXT:    image_sample_c_d_o v[0:1], [v10, v9, v2, v[3:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v11, v1, s[12:13]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -2072,7 +2072,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX12-LABEL: sample_c_d_o_2darray_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_o v[0:1], [v0, v1, v2, v[3:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2132,7 +2132,7 @@ define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v10, v1
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    image_sample_c_d_o v[0:2], [v11, v10, v9, v[3:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -2181,7 +2181,7 @@ define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0)
@@ -2222,7 +2222,7 @@ define amdgpu_ps <4 x float> @sample_1d_glc(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1)
@@ -2263,7 +2263,7 @@ define amdgpu_ps <4 x float> @sample_1d_slc(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 2)
@@ -2304,7 +2304,7 @@ define amdgpu_ps <4 x float> @sample_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_LU
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 3)
@@ -2345,7 +2345,7 @@ define amdgpu_ps float @adjust_writemask_sample_0(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2387,7 +2387,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_01(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2429,7 +2429,7 @@ define amdgpu_ps <3 x float> @adjust_writemask_sample_012(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2471,7 +2471,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_12(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2513,7 +2513,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_03(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2555,7 +2555,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_13(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2597,7 +2597,7 @@ define amdgpu_ps <3 x float> @adjust_writemask_sample_123(<8 x i32> inreg %rsrc,
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2660,7 +2660,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_123_to_12(<8 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 14, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -2702,7 +2702,7 @@ define amdgpu_ps <2 x float> @adjust_writemask_sample_013_to_13(<8 x i32> inreg
 ; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX12-NEXT:    image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 11, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index 134c736c6472f9..2ec7acb9934075 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -48,7 +48,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x03,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -154,7 +154,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -183,7 +183,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -206,7 +206,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-LABEL: sample_c_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -237,7 +237,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -270,7 +270,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x0f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -303,7 +303,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x8f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
-; GFX12-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index e12dcc8795c08a..ba9e16f4320d52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-LABEL: sample_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -48,7 +48,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-LABEL: sample_c_d_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -154,7 +154,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-LABEL: sample_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -183,7 +183,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -206,7 +206,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-LABEL: sample_c_d_cl_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -237,7 +237,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -270,7 +270,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -303,7 +303,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX12-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
index 195c5dabb4d461..78b299d6dc1d77 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
@@ -12,12 +12,16 @@
 ; GFX12: ds_direct_load v{{[0-9]+}}
 ; GCN: s_mov_b32 m0
 ; GFX11: lds_direct_load v{{[0-9]+}}
-; GCN: s_waitcnt expcnt(2)
+; GFX11: s_waitcnt expcnt(2)
+; GFX12: ds_direct_load v{{[0-9]+}}
+; GFX12: s_wait_expcnt 0x2
 ; GCN: v_add_f32
 ; GCN: buffer_store_b32
-; GCN: s_waitcnt expcnt(1)
+; GFX11: s_waitcnt expcnt(1)
+; GFX12: s_wait_expcnt 0x1
 ; GCN: buffer_store_b32
-; GCN: s_waitcnt expcnt(0)
+; GFX11: s_waitcnt expcnt(0)
+; GFX12: s_wait_expcnt 0x0
 ; GCN: buffer_store_b32
 ; GCN: buffer_store_b32
 ; GCN: buffer_store_b32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
index 1ab753d75fe031..3d3db711badee5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
@@ -15,16 +15,20 @@
 ; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.z
 ; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr0.w
 ; GFX12-DAG: ds_param_load v{{[0-9]+}}, attr1.x
-; GCN: s_waitcnt expcnt(4)
+; GFX11-DAG: s_waitcnt expcnt(4)
 ; GCN: v_add_f32
 ; GCN: buffer_store_b32
-; GCN: s_waitcnt expcnt(3)
+; GFX11-DAG: s_waitcnt expcnt(3)
+; GFX12-DAG: s_wait_expcnt 0x3
 ; GCN: buffer_store_b32
-; GCN: s_waitcnt expcnt(2)
+; GFX11-DAG: s_waitcnt expcnt(2)
+; GFX12-DAG: s_wait_expcnt 0x2
 ; GCN: buffer_store_b32
-; GCN: s_waitcnt expcnt(1)
+; GFX11-DAG: s_waitcnt expcnt(1)
+; GFX12-DAG: s_wait_expcnt 0x1
 ; GCN: buffer_store_b32
-; GCN: s_waitcnt expcnt(0)
+; GFX11-DAG: s_waitcnt expcnt(0)
+; GFX12-DAG: s_wait_expcnt 0x0
 ; GCN: buffer_store_b32
 ; GCN: buffer_store_b32
 define amdgpu_ps void @lds_param_load(ptr addrspace(8) inreg %buf, i32 inreg %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index d630ba946dca34..ac85313823d9c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -43,7 +43,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
 ; GFX12-LABEL: v_permlane16_b32_vii:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, 1, 2
@@ -132,7 +132,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
 ; GFX12-LABEL: v_permlane16_b32_vll:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_movk_i32 s2, 0x1234
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -208,7 +208,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
@@ -226,7 +226,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -295,7 +295,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
@@ -310,7 +310,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
@@ -381,7 +381,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
@@ -399,7 +399,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -444,7 +444,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi(ptr addrspace(1) %out, i32 %s
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
@@ -489,7 +489,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc(ptr addrspace(1) %out, i32 %s
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
@@ -534,7 +534,7 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
@@ -579,7 +579,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss(ptr addrspace(1) %out, i32 %src
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
@@ -622,7 +622,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
 ; GFX12-LABEL: v_permlanex16_b32_vii:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
@@ -668,7 +668,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
 ; GFX12-LABEL: v_permlanex16_b32_vll:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_movk_i32 s2, 0x1234
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -744,7 +744,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
@@ -762,7 +762,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
 ; GFX12-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -831,7 +831,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
@@ -846,7 +846,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
@@ -917,7 +917,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
@@ -935,7 +935,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
@@ -980,7 +980,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
@@ -1025,7 +1025,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
@@ -1070,7 +1070,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
@@ -1114,7 +1114,7 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, s3
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, s3
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1230,7 +1230,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v0, s2, s3
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3039
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v0, s2, s3
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1289,7 +1289,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1377,7 +1377,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1421,7 +1421,7 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, s3
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1464,7 +1464,7 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, s3
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1537,7 +1537,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s2, s3
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
@@ -1551,7 +1551,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3039
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s2, s3
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1640,7 +1640,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1684,7 +1684,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x30
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index f865418befed7b..7c439ab5afed39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
@@ -24,7 +24,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
@@ -43,7 +43,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
@@ -55,7 +55,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
@@ -74,7 +74,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc1d1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
@@ -104,7 +104,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
@@ -116,7 +116,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
@@ -136,7 +136,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -148,7 +148,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -167,7 +167,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -179,7 +179,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -198,7 +198,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -210,7 +210,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -229,7 +229,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
@@ -241,7 +241,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
@@ -260,7 +260,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
@@ -272,7 +272,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
@@ -291,7 +291,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc1d1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
@@ -303,7 +303,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
@@ -321,7 +321,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
@@ -333,7 +333,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
@@ -353,7 +353,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -365,7 +365,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -384,7 +384,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -396,7 +396,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -415,7 +415,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out,
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -427,7 +427,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out,
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -447,7 +447,7 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
@@ -461,7 +461,7 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
@@ -482,7 +482,7 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
@@ -496,7 +496,7 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
@@ -519,7 +519,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3039
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v0, v2
@@ -533,7 +533,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v0, v2
@@ -554,7 +554,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -568,7 +568,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -590,7 +590,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -604,7 +604,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -626,7 +626,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -640,7 +640,7 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -662,7 +662,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
@@ -676,7 +676,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
@@ -697,7 +697,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
@@ -711,7 +711,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
@@ -734,7 +734,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3039
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v0, v2
@@ -748,7 +748,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v0, v2
@@ -769,7 +769,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -783,7 +783,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
@@ -805,7 +805,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -819,7 +819,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
@@ -841,7 +841,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
@@ -855,7 +855,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b32 s2, s[0:1], 0x30
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index e6261bb5cb1325..df8b4f9be07c40 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -39,7 +39,7 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32>
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null
 ; GFX12-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], null th:TH_LOAD_NT
 ; GFX12-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], null th:TH_LOAD_HT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
@@ -84,7 +84,7 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null th:TH_LOAD_NT_RT
 ; GFX12-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], null th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], null th:TH_LOAD_NT_HT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4)
@@ -118,7 +118,7 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
 ; GFX12-LABEL: buffer_load_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:40
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0)
@@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_movk_i32 s4, 0x1ffc
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], s4 offset:4
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0)
@@ -180,7 +180,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
 ; GFX12-LABEL: buffer_load_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
@@ -209,7 +209,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 ; GFX12-LABEL: buffer_load_ofs_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:60
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %ofs = add i32 %1, 60
@@ -239,7 +239,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
 ; GFX12-LABEL: buffer_load_voffset_large_12bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:4092
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 0)
@@ -271,7 +271,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) {
 ; GFX12-LABEL: buffer_load_voffset_large_13bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:8188
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 0)
@@ -303,7 +303,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) {
 ; GFX12-LABEL: buffer_load_voffset_large_16bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:65532
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 0)
@@ -335,7 +335,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) {
 ; GFX12-LABEL: buffer_load_voffset_large_23bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:8388604
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 0)
@@ -368,7 +368,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:8388604
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 0)
@@ -398,7 +398,7 @@ define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) {
 ; GFX12-LABEL: buffer_load_x1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
@@ -427,7 +427,7 @@ define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) {
 ; GFX12-LABEL: buffer_load_x2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
@@ -453,7 +453,7 @@ define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, -16, v0
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %ofs.1 = add i32 %ofs, -16
@@ -487,9 +487,9 @@ define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3)
 ; GFX12-NEXT:    buffer_load_b32 v1, off, s[0:3], null
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    ds_store_2addr_b32 v0, v2, v2 offset1:4
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 entry:
   store float 0.0, ptr addrspace(3) %lds
@@ -537,9 +537,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    buffer_load_b128 v[1:4], v0, s[0:3], null offen offset:4
 ; GFX12-NEXT:    buffer_load_b64 v[5:6], v0, s[0:3], null offen offset:28
-; GFX12-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NEXT:    export mrt0 v1, v2, v3, v4 done
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v5, v6, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -602,9 +602,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    buffer_load_b128 v[0:3], v4, s[0:3], null offen offset:4
 ; GFX12-SDAG-NEXT:    buffer_load_b64 v[4:5], v4, s[0:3], null offen offset:28
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-SDAG-NEXT:    export mrt0 v0, v1, v2, v3 done
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    export mrt0 v4, v5, v0, v0 done
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -624,9 +624,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32
 ; GFX12-GISEL-NEXT:    buffer_load_b32 v4, v4, s[0:3], null offen
 ; GFX12-GISEL-NEXT:    buffer_load_b32 v5, v5, s[0:3], null offen
 ; GFX12-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-GISEL-NEXT:    export mrt0 v1, v2, v3, v4 done
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    export mrt0 v5, v0, v0, v0 done
 ; GFX12-GISEL-NEXT:    s_endpgm
 main_body:
@@ -690,9 +690,9 @@ define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc
 ; GFX12-NEXT:    buffer_load_b64 v[1:2], v0, s[0:3], null offen offset:4
 ; GFX12-NEXT:    buffer_load_b64 v[3:4], v0, s[0:3], null offen offset:12 th:TH_LOAD_NT
 ; GFX12-NEXT:    buffer_load_b64 v[5:6], v0, s[0:3], null offen offset:28 th:TH_LOAD_LU
-; GFX12-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NEXT:    export mrt0 v1, v2, v3, v4 done
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v5, v6, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -738,7 +738,7 @@ define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i3
 ; GFX12-LABEL: buffer_load_x2_offen_merged_and:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:4
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -783,7 +783,7 @@ define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen offset:4
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -838,9 +838,9 @@ define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:4
 ; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], null offset:28
-; GFX12-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v4, v5, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -880,7 +880,7 @@ define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: buffer_load_x2_offset_merged:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:4
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -927,7 +927,7 @@ define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> in
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null
 ; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], null th:TH_LOAD_NT
 ; GFX12-NEXT:    buffer_load_b32 v6, off, s[0:3], null th:TH_LOAD_HT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
@@ -967,7 +967,7 @@ define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: raw_buffer_load_ubyte:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_u8 v0, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1002,7 +1002,7 @@ define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: raw_buffer_load_i16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_u16 v0, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1037,7 +1037,7 @@ define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: raw_buffer_load_sbyte:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_i8 v0, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1072,7 +1072,7 @@ define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: raw_buffer_load_sshort:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_i16 v0, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -1108,7 +1108,7 @@ define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(
 ; GFX12-LABEL: raw_buffer_load_f16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_u16 v1, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ds_store_b16 v0, v1
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -1143,7 +1143,7 @@ define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspac
 ; GFX12-LABEL: raw_buffer_load_v2f16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b32 v1, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ds_store_b32 v0, v1
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -1178,7 +1178,7 @@ define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspac
 ; GFX12-LABEL: raw_buffer_load_v4f16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ds_store_b64 v0, v[1:2]
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -1213,7 +1213,7 @@ define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspac
 ; GFX12-LABEL: raw_buffer_load_v2i16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b32 v1, off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ds_store_b32 v0, v1
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -1248,7 +1248,7 @@ define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspac
 ; GFX12-LABEL: raw_buffer_load_v4i16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], null
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ds_store_b64 v0, v[1:2]
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -1295,9 +1295,9 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:4
 ; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], null offset:28
-; GFX12-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v4, v5, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -1362,9 +1362,9 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE
 ; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], null offset:28 scope:SCOPE_SE
-; GFX12-NEXT:    s_waitcnt vmcnt(1)
+; GFX12-NEXT:    s_wait_loadcnt 0x1
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v4, v5, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
index a3f275fb716ed6..4443a1e0dcb01b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll
@@ -34,7 +34,7 @@ define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED-LABEL: tbuffer_load_d16_x:
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_32_FLOAT]
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
   %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0)
@@ -73,7 +73,7 @@ define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED-LABEL: tbuffer_load_d16_xy:
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_32_FLOAT]
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
@@ -114,7 +114,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED-LABEL: tbuffer_load_d16_xyz:
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_32_FLOAT]
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
@@ -155,7 +155,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED-LABEL: tbuffer_load_d16_xyzw:
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_32_FLOAT]
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
index 7ce5571f59f942..9290feb73a1c4d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
@@ -43,7 +43,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_l
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[4:7], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] th:TH_LOAD_NT
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[8:11], off, s[0:3], null format:[BUF_FMT_32_FLOAT] th:TH_LOAD_HT
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[12:15], off, s[0:3], null format:[BUF_FMT_32_FLOAT] th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata     = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0)
@@ -82,7 +82,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
 ; GFX12-LABEL: tbuffer_load_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:78 offset:42
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata   = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0)
@@ -112,7 +112,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
 ; GFX12-LABEL: buffer_load_voffset_large_12bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 4092, i32 0, i32 63, i32 0)
@@ -144,7 +144,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(<4 x i32> inreg)
 ; GFX12-LABEL: tbuffer_load_voffset_large_13bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8188
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 8188, i32 0, i32 63, i32 0)
@@ -176,7 +176,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(<4 x i32> inreg)
 ; GFX12-LABEL: tbuffer_load_voffset_large_16bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:65532
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 65532, i32 0, i32 63, i32 0)
@@ -208,7 +208,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(<4 x i32> inreg)
 ; GFX12-LABEL: tbuffer_load_voffset_large_23bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8388604
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 8388604, i32 0, i32 63, i32 0)
@@ -241,7 +241,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(<4 x i32> inreg)
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:8388604
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 16777212, i32 0, i32 63, i32 0)
@@ -282,7 +282,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], off, s[0:3], s5 format:[BUF_FMT_8_8_8_8_SINT] offset:4095
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[4:7], off, s[0:3], s4 format:[BUF_FMT_32_32_32_32_SINT] offset:73
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[8:11], off, s[0:3], s4 format:77 offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
     %vdata     = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0)
     %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0)
@@ -318,7 +318,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
 ; GFX12-LABEL: tbuffer_load_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata   = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0)
@@ -348,7 +348,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs)
 ; GFX12-LABEL: tbuffer_load_ofs_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 offen offset:52
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %ofs = add i32 %voffs, 52
@@ -379,7 +379,7 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: buffer_load_xy:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    tbuffer_load_format_xy v[0:1], off, s[0:3], null format:77
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
     %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0)
     %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
@@ -408,7 +408,7 @@ define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
 ; GFX12-LABEL: buffer_load_x:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    tbuffer_load_format_x v0, off, s[0:3], null format:77
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
     %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0)
     %vdata.f = bitcast i32 %vdata to float
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index f258b443009aea..59b3776b844814 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
 ; GFX12-PACKED-NEXT:    s_clause 0x1
 ; GFX12-PACKED-NEXT:    s_load_b32 s4, s[0:1], 0x34
 ; GFX12-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
 ; GFX12-PACKED-NEXT:    s_nop 0
@@ -113,7 +113,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
 ; GFX12-PACKED-NEXT:    s_clause 0x1
 ; GFX12-PACKED-NEXT:    s_load_b32 s4, s[0:1], 0x34
 ; GFX12-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
 ; GFX12-PACKED-NEXT:    s_nop 0
@@ -181,7 +181,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-SDAG-NEXT:    s_clause 0x1
 ; GFX12-PACKED-SDAG-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-PACKED-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-SDAG-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v1, s5
@@ -195,7 +195,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT:    s_clause 0x1
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-GISEL-NEXT:    s_pack_lh_b32_b16 s4, s4, s4
 ; GFX12-PACKED-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -266,7 +266,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX12-PACKED-NEXT:    s_clause 0x1
 ; GFX12-PACKED-NEXT:    s_load_b64 s[4:5], s[0:1], 0x34
 ; GFX12-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 818e8eb9463953..ad90a0a81ea956 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT4:       ; %bb.0: ; %entry
 ; VARIANT4-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; VARIANT4-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; VARIANT4-NEXT:    s_waitcnt lgkmcnt(0)
+; VARIANT4-NEXT:    s_wait_kmcnt 0x0
 ; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s2
 ; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT4-NEXT:    s_barrier_signal -1
@@ -105,7 +105,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
 ; VARIANT4-NEXT:    global_load_b32 v0, v[1:2], off
-; VARIANT4-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT4-NEXT:    s_wait_loadcnt 0x0
 ; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT4-NEXT:    s_nop 0
 ; VARIANT4-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -115,7 +115,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT5:       ; %bb.0: ; %entry
 ; VARIANT5-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; VARIANT5-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; VARIANT5-NEXT:    s_waitcnt lgkmcnt(0)
+; VARIANT5-NEXT:    s_wait_kmcnt 0x0
 ; VARIANT5-NEXT:    v_xad_u32 v1, v0, -1, s2
 ; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT5-NEXT:    s_barrier_signal -1
@@ -127,7 +127,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
 ; VARIANT5-NEXT:    global_load_b32 v0, v[1:2], off
-; VARIANT5-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT5-NEXT:    s_wait_loadcnt 0x0
 ; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; VARIANT5-NEXT:    s_nop 0
 ; VARIANT5-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -137,7 +137,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6:       ; %bb.0: ; %entry
 ; VARIANT6-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; VARIANT6-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
-; VARIANT6-NEXT:    s_waitcnt lgkmcnt(0)
+; VARIANT6-NEXT:    s_wait_kmcnt 0x0
 ; VARIANT6-NEXT:    s_sub_co_i32 s2, s2, 1
 ; VARIANT6-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s2, v0
@@ -151,7 +151,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; VARIANT6-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
 ; VARIANT6-NEXT:    global_load_b32 v0, v[1:2], off
-; VARIANT6-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT6-NEXT:    s_wait_loadcnt 0x0
 ; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
 ; VARIANT6-NEXT:    s_nop 0
 ; VARIANT6-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 220002ce4f6c45..00caa2a36c58da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -10,7 +10,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_barrier_signal -1
 ; GCN-NEXT:    s_barrier_wait -1
@@ -26,7 +26,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
@@ -54,7 +54,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_barrier_signal 1
 ; GCN-NEXT:    s_barrier_wait 1
@@ -70,7 +70,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal 1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
@@ -98,7 +98,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_barrier_signal 0
 ; GCN-NEXT:    s_barrier_wait 0
@@ -114,7 +114,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal 0
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 0
@@ -144,7 +144,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GCN-NEXT:    s_mov_b32 m0, 1
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GCN-NEXT:    s_barrier_signal m0
 ; GCN-NEXT:    s_barrier_wait 1
@@ -161,7 +161,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, 1
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
@@ -184,7 +184,11 @@ entry:
 define void @test2_s_barrier_signal_var(i32 %arg) {
 ; GCN-LABEL: test2_s_barrier_signal_var:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    s_mov_b32 m0, s0
@@ -193,7 +197,11 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
 ;
 ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var:
 ; GLOBAL-ISEL:       ; %bb.0:
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_expcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_samplecnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -206,7 +214,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_barrier_signal_isfirst -1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
@@ -214,7 +222,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_nop 0
@@ -225,7 +233,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst -1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
@@ -236,7 +244,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    s_clause 0x1
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
@@ -260,7 +268,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_barrier_signal_isfirst 1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
@@ -268,7 +276,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_nop 0
@@ -279,7 +287,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst 1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
@@ -290,7 +298,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    s_clause 0x1
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
@@ -314,7 +322,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_barrier_signal_isfirst 1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
@@ -322,7 +330,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_nop 0
@@ -333,7 +341,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst 1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
@@ -344,7 +352,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    s_clause 0x1
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
@@ -369,7 +377,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_mov_b32 m0, 1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_barrier_signal_isfirst m0
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
@@ -377,7 +385,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_nop 0
@@ -389,7 +397,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GLOBAL-ISEL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, 1
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst m0
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
@@ -400,7 +408,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GLOBAL-ISEL-NEXT:    s_clause 0x1
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
@@ -422,7 +430,11 @@ entry:
 define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) {
 ; GCN-LABEL: test2_s_barrier_signal_isfirst_var:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v6
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -437,15 +449,19 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 ; GCN-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    global_load_b32 v1, v[2:3], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GCN-NEXT:    global_store_b32 v[7:8], v0, off
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var:
 ; GLOBAL-ISEL:       ; %bb.0:
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_expcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_samplecnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v6
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -462,10 +478,10 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 ; GLOBAL-ISEL-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v0, v[0:1], off
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v[2:3], off
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v[7:8], v0, off
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
@@ -487,7 +503,7 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_mov_b32 m0, s2
@@ -504,7 +520,7 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init -1
@@ -531,7 +547,7 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_mov_b32 m0, s2
@@ -548,7 +564,7 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init 1
@@ -575,7 +591,7 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_mov_b32 m0, s2
@@ -592,7 +608,7 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_lshl_b32 m0, 16, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_init 0
@@ -619,7 +635,7 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar,
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 16
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_or_b32 s2, s2, s3
@@ -637,7 +653,7 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar,
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_lshl_b32 s3, 16, s3
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_or_b32 m0, s2, s3
@@ -660,7 +676,11 @@ entry:
 define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
 ; GCN-LABEL: test5_s_barrier_init_m0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -672,7 +692,11 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
 ;
 ; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0:
 ; GLOBAL-ISEL:       ; %bb.0:
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_expcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_samplecnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 s0, v1
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 s1, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -694,7 +718,7 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    s_barrier_join -1
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -707,7 +731,7 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_join -1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -734,7 +758,7 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    s_barrier_join 1
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -747,7 +771,7 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_join 1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -774,7 +798,7 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    s_barrier_join 0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -787,7 +811,7 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_join 0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -814,7 +838,7 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GCN-NEXT:    s_barrier_join m0
@@ -830,7 +854,7 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_join m0
@@ -852,7 +876,11 @@ entry:
 define void @test5_s_barrier_join_m0(i32 %arg) {
 ; GCN-LABEL: test5_s_barrier_join_m0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    s_mov_b32 m0, s0
@@ -861,7 +889,11 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
 ;
 ; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0:
 ; GLOBAL-ISEL:       ; %bb.0:
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_expcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_samplecnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
 ; GLOBAL-ISEL-NEXT:    s_barrier_join m0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -874,7 +906,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_barrier_leave
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
@@ -882,7 +914,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GCN-NEXT:    s_nop 0
@@ -893,7 +925,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_barrier_leave
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
@@ -904,7 +936,7 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GLOBAL-ISEL-NEXT:    s_clause 0x1
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v2, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v1, s[2:3]
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
 ; GLOBAL-ISEL-NEXT:    s_nop 0
@@ -932,7 +964,7 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    s_wakeup_barrier -1
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -945,7 +977,7 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier -1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -972,7 +1004,7 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    s_wakeup_barrier 1
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -985,7 +1017,7 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier 1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -1012,7 +1044,7 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    s_wakeup_barrier 0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1025,7 +1057,7 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier 0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -1052,7 +1084,7 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GCN-NEXT:    s_wakeup_barrier m0
@@ -1068,7 +1100,7 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier m0
@@ -1090,7 +1122,11 @@ entry:
 define void @test5_s_wakeup_barrier_m0(i32 %arg) {
 ; GCN-LABEL: test5_s_wakeup_barrier_m0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    s_mov_b32 m0, s0
@@ -1099,7 +1135,11 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
 ;
 ; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0:
 ; GLOBAL-ISEL:       ; %bb.0:
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_expcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_samplecnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
 ; GLOBAL-ISEL-NEXT:    s_wakeup_barrier m0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1112,7 +1152,7 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_get_barrier_state s2, -1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1124,10 +1164,10 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_get_barrier_state s2, -1
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1148,7 +1188,7 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_get_barrier_state s2, 1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1160,10 +1200,10 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_get_barrier_state s2, 1
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1184,7 +1224,7 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_get_barrier_state s2, 0
 ; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1196,10 +1236,10 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_get_barrier_state s2, 0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1220,11 +1260,11 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_mov_b32 m0, s2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GCN-NEXT:    s_get_barrier_state s2, m0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1236,11 +1276,11 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i
 ; GLOBAL-ISEL:       ; %bb.0: ; %entry
 ; GLOBAL-ISEL-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_get_barrier_state s2, m0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1259,21 +1299,29 @@ entry:
 define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
 ; GCN-LABEL: test5_s_get_barrier_state_m0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
 ; GCN-NEXT:    s_mov_b32 m0, s0
 ; GCN-NEXT:    s_get_barrier_state s0, m0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0:
 ; GLOBAL-ISEL:       ; %bb.0:
-; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_expcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_samplecnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
 ; GLOBAL-ISEL-NEXT:    s_get_barrier_state s0, m0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1289,7 +1337,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GCN-NEXT:    s_barrier_signal -1
 ; GCN-NEXT:    s_barrier_wait -1
@@ -1305,7 +1353,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 7dffd1a75ed0ec..71eabf2d4473dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -35,7 +35,7 @@ define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x4
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    export mrt0 v0, v0, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
@@ -74,7 +74,7 @@ define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %ind
 ; GFX12-LABEL: s_buffer_load_index:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    export mrt0 v0, v0, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
@@ -103,7 +103,7 @@ define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32
 ; GFX12-LABEL: s_buffer_load_index_divergent:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v0, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -144,7 +144,7 @@ define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_loadx2_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], 0x40
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    export mrt0 v0, v1, v0, v0 done
@@ -189,7 +189,7 @@ define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %i
 ; GFX12-LABEL: s_buffer_loadx2_index:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    export mrt0 v0, v1, v0, v0 done
@@ -221,7 +221,7 @@ define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i3
 ; GFX12-LABEL: s_buffer_loadx2_index_divergent:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -267,7 +267,7 @@ define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_loadx3_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b96 s[0:2], s[0:3], 0x40
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
@@ -317,7 +317,7 @@ define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %i
 ; GFX12-LABEL: s_buffer_loadx3_index:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b96 s[0:2], s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
@@ -358,7 +358,7 @@ define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i3
 ; GFX12-LABEL: s_buffer_loadx3_index_divergent:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b96 v[0:2], v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -408,7 +408,7 @@ define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_loadx4_imm:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b128 s[0:3], s[0:3], 0xc8
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
@@ -463,7 +463,7 @@ define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %i
 ; GFX12-LABEL: s_buffer_loadx4_index:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b128 s[0:3], s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
@@ -499,7 +499,7 @@ define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i3
 ; GFX12-LABEL: s_buffer_loadx4_index_divergent:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v2, v3 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -544,7 +544,7 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_mergex2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b64 s[0:1], s[0:3], 0x4
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    export mrt0 v0, v1, v0, v0 done
@@ -595,7 +595,7 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_mergex4:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_buffer_load_b128 s[0:3], s[0:3], 0x8
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
@@ -728,11 +728,11 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-NEXT:    v_or_b32_e32 v0, 8, v0
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v0, v0, v0 done
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -771,7 +771,7 @@ define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %des
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:8
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v1, v0, v0 done
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -823,7 +823,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg1(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0)
   ret i32 %load
@@ -869,7 +869,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg4(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mov_b32 s4, -4
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0)
   ret i32 %load
@@ -915,7 +915,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg8(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mov_b32 s4, -8
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0)
   ret i32 %load
@@ -961,7 +961,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit31(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_brev_b32 s4, 1
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0)
   ret i32 %load
@@ -1007,7 +1007,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit30(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mov_b32 s4, 2.0
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 0)
   ret i32 %load
@@ -1053,7 +1053,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit29(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_brev_b32 s4, 4
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0)
   ret i32 %load
@@ -1098,7 +1098,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit21(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_bit21:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x200000
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0)
   ret i32 %load
@@ -1143,7 +1143,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit20(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_bit20:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x100000
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0)
   ret i32 %load
@@ -1189,7 +1189,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mov_b32 s4, 0xfff00000
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32  -1048576, i32 0)
   ret i32 %load
@@ -1225,7 +1225,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_bit19(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_bit19:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x80000
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0)
   ret i32 %load
@@ -1271,7 +1271,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(<4 x i32> inreg %desc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_mov_b32 s4, 0xfff80000
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0)
   ret i32 %load
@@ -1308,7 +1308,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_255(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_255:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0xff
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 255, i32 0)
   ret i32 %load
@@ -1336,7 +1336,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_256(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_256:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x100
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 256, i32 0)
   ret i32 %load
@@ -1364,7 +1364,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1016(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_1016:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x3f8
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1016, i32 0)
   ret i32 %load
@@ -1392,7 +1392,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1020(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_1020:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x3fc
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1020, i32 0)
   ret i32 %load
@@ -1429,7 +1429,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1021(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_1021:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x3fd
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1021, i32 0)
   ret i32 %load
@@ -1465,7 +1465,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1024(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_1024:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x400
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0)
   ret i32 %load
@@ -1502,7 +1502,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1025(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_1025:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x401
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1025, i32 0)
   ret i32 %load
@@ -1538,7 +1538,7 @@ define amdgpu_ps i32 @s_buffer_load_imm_1028(<4 x i32> inreg %desc) {
 ; GFX12-LABEL: s_buffer_load_imm_1028:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_buffer_load_b32 s0, s[0:3], 0x400
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1024, i32 0)
   ret i32 %load
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
index 5ad7ddfbe5fe9d..4294cea9109f0b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
@@ -7,7 +7,11 @@ declare void @llvm.amdgcn.s.sleep.var(i32)
 define void @test_s_sleep_var1(i32 %arg) {
 ; GCN-LABEL: test_s_sleep_var1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    s_sleep_var s0
@@ -19,7 +23,11 @@ define void @test_s_sleep_var1(i32 %arg) {
 define void @test_s_sleep_var2() {
 ; GCN-LABEL: test_s_sleep_var2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_sleep_var 10
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.amdgcn.s.sleep.var(i32 10)
@@ -30,7 +38,7 @@ define amdgpu_kernel void @test_s_sleep_var3(i32 %arg) {
 ; GCN-LABEL: test_s_sleep_var3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_sleep_var s0
 ; GCN-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.sleep.var(i32 %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index d570dada4f1ab1..9dd11699e5dcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -41,7 +41,7 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32>
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], null idxen
 ; GFX12-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], null idxen th:TH_LOAD_NT
 ; GFX12-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], null idxen th:TH_LOAD_HT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
@@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:42
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
@@ -155,11 +155,11 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v8, s[0:3], s4 idxen offset:4092
 ; GFX12-NEXT:    buffer_load_format_xyzw v[4:7], v8, s[0:3], s5 idxen offset:4092
 ; GFX12-NEXT:    s_mov_b32 s4, 0x8ffc
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e32 v1, v1, v5
 ; GFX12-NEXT:    buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4
 ; GFX12-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
@@ -200,7 +200,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:4092
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 0, i32 0)
@@ -239,7 +239,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:8188
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 8188, i32 0, i32 0)
@@ -278,7 +278,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:65532
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 65532, i32 0, i32 0)
@@ -317,7 +317,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen offset:8388604
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 8388604, i32 0, i32 0)
@@ -356,14 +356,14 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) {
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0
 ; GFX12-SDAG-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen offset:8388604
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: buffer_load_voffset_large_24bit:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800000
 ; GFX12-GISEL-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen offset:8388604
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 16777212, i32 0, i32 0)
@@ -392,7 +392,7 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
 ; GFX12-LABEL: buffer_load_idx:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0)
@@ -431,7 +431,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0)
@@ -470,7 +470,7 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen offset:60
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %ofs = add i32 %1, 60
@@ -500,7 +500,7 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
 ; GFX12-LABEL: buffer_load_both:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0)
@@ -533,7 +533,7 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], null idxen offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0)
@@ -566,7 +566,7 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_x v0, v0, s[0:3], null idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -599,7 +599,7 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_x v0, v0, s[0:3], null idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -633,7 +633,7 @@ define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    buffer_load_format_xy v[0:1], v0, s[0:3], null idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -678,7 +678,7 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -728,7 +728,7 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -779,7 +779,7 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -830,7 +830,7 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -880,7 +880,7 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -930,7 +930,7 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -980,7 +980,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -1030,7 +1030,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index f0a73feaa4a14f..4c1ae4c228adb3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -135,7 +135,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
 ; GFX12-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
index be7c8de4e23e65..9be6b018a06412 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
@@ -39,7 +39,7 @@ define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_x v0, v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
   %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
@@ -83,7 +83,7 @@ define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_xy v0, v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
@@ -129,7 +129,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_xyz v[0:1], v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
@@ -175,7 +175,7 @@ define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
 ; GFX12-PACKED:       ; %bb.0: ; %main_body
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-PACKED-NEXT:    tbuffer_load_d16_format_xyzw v[0:1], v0, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen
-; GFX12-PACKED-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PACKED-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX12-PACKED-NEXT:    ; return to shader part epilog
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
index 1c66ebf9688ff0..8a1f6911032137 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
@@ -47,7 +47,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_l
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[4:7], v12, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen th:TH_LOAD_NT
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[8:11], v12, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen th:TH_LOAD_HT
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[12:15], v12, s[0:3], null format:[BUF_FMT_32_FLOAT] idxen th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata     = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0)
@@ -90,7 +90,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 idxen offset:42
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata   = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0)
@@ -136,7 +136,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v8, s[0:3], s5 format:[BUF_FMT_8_8_8_8_SINT] idxen offset:4095
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[4:7], v8, s[0:3], s4 format:[BUF_FMT_32_32_32_32_SINT] idxen offset:73
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[8:11], v8, s[0:3], s4 format:77 idxen offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
     %vdata     = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0)
     %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0)
@@ -172,7 +172,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
 ; GFX12-LABEL: tbuffer_load_idx:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:78 idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata   = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0)
@@ -212,7 +212,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:78 idxen offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata   = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0)
@@ -252,7 +252,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs)
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:78 idxen offen offset:52
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %ofs = add i32 %voffs, 52
@@ -283,7 +283,7 @@ define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i3
 ; GFX12-LABEL: tbuffer_load_both:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:78 idxen offen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
     %vdata   = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0)
@@ -317,7 +317,7 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xy v[0:1], v0, s[0:3], null format:77 idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
     %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0)
     %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
@@ -350,7 +350,7 @@ define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_x v0, v0, s[0:3], null format:77 idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
     %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0)
     %vdata.f = bitcast i32 %vdata to float
@@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 0, i32 63, i32 0)
@@ -422,7 +422,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_13bit(<4 x i32> inreg)
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8188
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 8188, i32 0, i32 63, i32 0)
@@ -461,7 +461,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_16bit(<4 x i32> inreg)
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:65532
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 65532, i32 0, i32 63, i32 0)
@@ -500,7 +500,7 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_23bit(<4 x i32> inreg)
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    tbuffer_load_format_xyzw v[0:3], v0, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8388604
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 8388604, i32 0, i32 63, i32 0)
@@ -539,14 +539,14 @@ define amdgpu_ps <4 x float> @tbuffer_load_voffset_large_24bit(<4 x i32> inreg)
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0
 ; GFX12-SDAG-NEXT:    tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:8388604
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: tbuffer_load_voffset_large_24bit:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800000
 ; GFX12-GISEL-NEXT:    tbuffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:8388604
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 16777212, i32 0, i32 63, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 22da2d0b023269..22ec22dc2db024 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -58,7 +58,7 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32
 ; GFX12-PACKED-NEXT:    s_clause 0x1
 ; GFX12-PACKED-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
 ; GFX12-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
@@ -123,7 +123,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
 ; GFX12-PACKED-NEXT:    s_clause 0x1
 ; GFX12-PACKED-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
 ; GFX12-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
@@ -200,7 +200,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-SDAG-NEXT:    s_clause 0x1
 ; GFX12-PACKED-SDAG-NEXT:    s_load_b96 s[4:6], s[0:1], 0x10
 ; GFX12-PACKED-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-SDAG-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v1, s5
@@ -215,7 +215,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT:    s_clause 0x1
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b96 s[4:6], s[0:1], 0x10
 ; GFX12-PACKED-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-GISEL-NEXT:    s_pack_lh_b32_b16 s4, s4, s4
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -294,7 +294,7 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX12-PACKED-NEXT:    s_clause 0x1
 ; GFX12-PACKED-NEXT:    s_load_b96 s[4:6], s[0:1], 0x10
 ; GFX12-PACKED-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index 9bbabce5b22076..85fc76eb182a3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -305,7 +305,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v5, s[0:3], null idxen
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v6, s[0:3], null format:[BUF_FMT_8_8_8_8_UINT] idxen
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index d299e760b87740..d5e62896232e25 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -58,10 +58,10 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur
 ; GFX12-LABEL: constant_load_v8f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s12, s[10:11], 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[8:9], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_f32 s0, s0, s12
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
 ; GFX12-NEXT:    s_add_f32 s0, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index d00044c6ac1ab8..cfaefca3a516d7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -48,10 +48,10 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-LABEL: constant_load_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -138,10 +138,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[20:21], s[18:19], 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[16:17], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_f64_e64 v[0:1], s[0:1], s[20:21]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 4ed4034a0348f4..cef0c8733d9912 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -67,9 +67,9 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -141,9 +141,9 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -213,9 +213,9 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -286,9 +286,9 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -359,9 +359,9 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -432,9 +432,9 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -488,9 +488,9 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v32i1:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -547,10 +547,10 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v64i1:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -612,9 +612,9 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -679,9 +679,9 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -744,9 +744,9 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -811,9 +811,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -883,9 +883,9 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -960,9 +960,9 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1045,9 +1045,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v3, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -1136,9 +1136,9 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v3, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 2, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
@@ -1225,9 +1225,9 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 2, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v3, 3, v0
@@ -1322,9 +1322,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 3, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 1, v0
@@ -1442,9 +1442,9 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 5, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v6, 3, v0
@@ -1577,9 +1577,9 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 4, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 5, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 6, v0
@@ -1766,9 +1766,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v16, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 13, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v13, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v15, 3, v0
@@ -1989,9 +1989,9 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v16, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 12, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 13, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v8, 14, v0
@@ -2347,9 +2347,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 13, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 9, s2
@@ -2768,9 +2768,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v0, 12, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v12, 13, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v13, 14, s2
@@ -3442,9 +3442,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 13, s2
 ; GFX12-NEXT:    s_lshr_b32 s4, s3, 24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
@@ -4242,9 +4242,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
 ; GFX12-NEXT:    v_lshrrev_b16 v28, 12, s3
 ; GFX12-NEXT:    v_lshrrev_b16 v29, 13, s3
@@ -4454,9 +4454,9 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -4525,9 +4525,9 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -4597,9 +4597,9 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -4668,9 +4668,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -4748,9 +4748,9 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -4832,9 +4832,9 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -4934,9 +4934,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v5, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v5, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v3, 2, v0
@@ -5044,9 +5044,9 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v6, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 2, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
@@ -5161,9 +5161,9 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v6, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 1, v0
@@ -5285,9 +5285,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 3, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v3, 1, v0
@@ -5456,9 +5456,9 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v12, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 5, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v8, 3, v0
@@ -5645,9 +5645,9 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v16, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v3, 6, v1
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 7, v1
 ; GFX12-NEXT:    v_lshrrev_b16 v7, 4, v1
@@ -5933,9 +5933,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v28, 1, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 11, v0
 ; GFX12-NEXT:    v_lshrrev_b16 v8, 9, v0
@@ -6266,9 +6266,9 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v32, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v32, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v3, 14, v1
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 15, v1
 ; GFX12-NEXT:    v_lshrrev_b16 v7, 12, v1
@@ -6781,9 +6781,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v0, 13, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v3, 11, s2
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
@@ -7446,9 +7446,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v26, 6, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v28, 7, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 2, s2
@@ -8432,9 +8432,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 11, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v6, 7, s2
 ; GFX12-NEXT:    s_lshr_b32 s4, s3, 24
@@ -9725,9 +9725,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_mov_b32 s5, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s19, s5
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s26, s3, 22
 ; GFX12-NEXT:    s_lshr_b32 s28, s3, 23
 ; GFX12-NEXT:    s_lshr_b32 s30, s3, 20
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 585f96b9ffb2e6..f48a693315671c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -79,9 +79,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -148,9 +148,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v2i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -253,9 +253,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v3i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    s_clause 0x1
@@ -329,10 +329,10 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v4i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -409,10 +409,10 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v8i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
@@ -523,9 +523,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v16i16:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
@@ -744,7 +744,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0xf
 ; GFX12-NEXT:    global_load_u16 v3, v8, s[0:1] offset:28
 ; GFX12-NEXT:    global_load_u16 v2, v8, s[0:1] offset:24
@@ -762,9 +762,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GFX12-NEXT:    global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
 ; GFX12-NEXT:    global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
 ; GFX12-NEXT:    global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
-; GFX12-NEXT:    s_waitcnt vmcnt(4)
+; GFX12-NEXT:    s_wait_loadcnt 0x4
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[0:3], off
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -839,9 +839,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -917,9 +917,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -994,9 +994,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1072,9 +1072,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1153,9 +1153,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s3, s2, 0xffff
 ; GFX12-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1241,9 +1241,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_sext_i32_i16 s3, s2
 ; GFX12-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1339,9 +1339,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX12-NEXT:    s_and_b32 s4, s2, 0xffff
 ; GFX12-NEXT:    s_lshr_b32 s2, s2, 16
@@ -1441,9 +1441,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s4, s2, 16
 ; GFX12-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX12-NEXT:    s_sext_i32_i16 s3, s3
@@ -1546,9 +1546,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX12-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX12-NEXT:    s_and_b32 s5, s2, 0xffff
@@ -1656,9 +1656,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v4i16_to_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s4, s3, 16
 ; GFX12-NEXT:    s_ashr_i32 s5, s2, 16
 ; GFX12-NEXT:    s_sext_i32_i16 s2, s2
@@ -1808,9 +1808,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s8, s7, 16
 ; GFX12-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GFX12-NEXT:    s_and_b32 s9, s6, 0xffff
@@ -1970,9 +1970,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s8, s7, 16
 ; GFX12-NEXT:    s_ashr_i32 s9, s6, 16
 ; GFX12-NEXT:    s_sext_i32_i16 s6, s6
@@ -2220,9 +2220,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s16, s11, 16
 ; GFX12-NEXT:    s_and_b32 s11, s11, 0xffff
 ; GFX12-NEXT:    s_and_b32 s17, s10, 0xffff
@@ -2488,9 +2488,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s16, s11, 16
 ; GFX12-NEXT:    s_ashr_i32 s17, s10, 16
 ; GFX12-NEXT:    s_sext_i32_i16 s10, s10
@@ -2938,9 +2938,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s33, s15, 16
 ; GFX12-NEXT:    s_and_b32 s15, s15, 0xffff
 ; GFX12-NEXT:    s_and_b32 s34, s14, 0xffff
@@ -3428,9 +3428,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s33, s15, 16
 ; GFX12-NEXT:    s_ashr_i32 s34, s14, 16
 ; GFX12-NEXT:    s_sext_i32_i16 s14, s14
@@ -4291,11 +4291,11 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x40
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s49, s31, 16
 ; GFX12-NEXT:    s_lshr_b32 s65, s15, 16
 ; GFX12-NEXT:    s_lshr_b32 s66, s14, 16
@@ -5230,11 +5230,11 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x40
 ; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s65, s15, 16
 ; GFX12-NEXT:    s_ashr_i32 s66, s14, 16
 ; GFX12-NEXT:    s_sext_i32_i16 s14, s14
@@ -5431,9 +5431,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -5521,9 +5521,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -5606,9 +5606,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -5691,9 +5691,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -5783,9 +5783,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s3, 0xffff, s2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
@@ -5883,9 +5883,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
 ; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
@@ -6010,9 +6010,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s4, 0xffff, s2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
@@ -6157,9 +6157,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s6, s3
 ; GFX12-NEXT:    s_lshr_b32 s8, s3, 16
 ; GFX12-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x100000
@@ -6358,9 +6358,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s2, 0xffff, s7
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -6599,9 +6599,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s14, s7
 ; GFX12-NEXT:    s_lshr_b32 s16, s7, 16
 ; GFX12-NEXT:    s_bfe_i64 s[12:13], s[6:7], 0x100000
@@ -6946,9 +6946,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s10, s5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
@@ -7382,9 +7382,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s30, s5
 ; GFX12-NEXT:    s_lshr_b32 s34, s5, 16
 ; GFX12-NEXT:    s_bfe_i64 s[28:29], s[4:5], 0x100000
@@ -8027,9 +8027,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s18, s15, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s18
@@ -8855,9 +8855,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s28, s2, 16
 ; GFX12-NEXT:    s_lshr_b32 s42, s5, 16
 ; GFX12-NEXT:    s_lshr_b32 s52, s8, 16
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 16f95409055b19..96ceba8ae3b66e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -74,9 +74,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-LABEL: constant_load_i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -159,10 +159,10 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v2i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -256,9 +256,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v3i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b96 s[4:6], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
@@ -350,10 +350,10 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v4i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
@@ -478,9 +478,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v8i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
@@ -637,11 +637,11 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v9i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s12, s[10:11], 0x20
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, s12
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
@@ -804,12 +804,12 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v10i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[12:13], s[10:11], 0x20
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6
@@ -983,11 +983,11 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v11i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b96 s[12:14], s[10:11], 0x20
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12
 ; GFX12-NEXT:    v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -1159,12 +1159,12 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v12i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[12:15], s[10:11], 0x20
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v12, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14
 ; GFX12-NEXT:    v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
@@ -1366,9 +1366,9 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v16i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
@@ -1463,9 +1463,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX12-LABEL: constant_zextload_i32_to_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1553,9 +1553,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX12-LABEL: constant_sextload_i32_to_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
@@ -1641,9 +1641,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1731,9 +1731,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v1i32_to_v1i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
@@ -1830,9 +1830,9 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
@@ -1941,9 +1941,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v2i32_to_v2i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX12-NEXT:    s_ashr_i32 s5, s2, 31
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -2071,9 +2071,9 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
@@ -2227,9 +2227,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s8, s7, 31
 ; GFX12-NEXT:    s_ashr_i32 s9, s6, 31
 ; GFX12-NEXT:    s_ashr_i32 s2, s5, 31
@@ -2425,9 +2425,9 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
@@ -2684,9 +2684,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s16, s11, 31
 ; GFX12-NEXT:    s_ashr_i32 s17, s10, 31
 ; GFX12-NEXT:    s_ashr_i32 s14, s9, 31
@@ -3143,9 +3143,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s28, s11, 31
 ; GFX12-NEXT:    s_ashr_i32 s29, s10, 31
 ; GFX12-NEXT:    s_ashr_i32 s33, s15, 31
@@ -3504,9 +3504,9 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s14
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s15
@@ -4356,11 +4356,11 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x0
 ; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x40
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s49, s15, 31
 ; GFX12-NEXT:    s_ashr_i32 s64, s31, 31
 ; GFX12-NEXT:    s_ashr_i32 s65, s30, 31
@@ -5032,11 +5032,11 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX12-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x40
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s31
@@ -5427,11 +5427,11 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x40
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 9b3830671acbd6..e1b7d67b208df6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -64,10 +64,10 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-LABEL: constant_load_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -143,10 +143,10 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v2i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
@@ -254,11 +254,11 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v3i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[8:9], s[2:3], 0x10
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
@@ -373,9 +373,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v4i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
@@ -559,9 +559,9 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v8i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
@@ -900,11 +900,11 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX12-LABEL: constant_load_v16i64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x40
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index f18a34515a8265..3c819c12434707 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -80,9 +80,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -167,9 +167,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -279,9 +279,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-LABEL: constant_load_v3i8:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_d16_hi_b8 v0, v1, s[0:1] offset:2
@@ -351,9 +351,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-LABEL: constant_load_v4i8:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -424,10 +424,10 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-LABEL: constant_load_v8i8:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -504,10 +504,10 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-LABEL: constant_load_v16i8:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
@@ -584,9 +584,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -662,9 +662,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -739,9 +739,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -817,9 +817,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -913,9 +913,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1013,9 +1013,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -1113,9 +1113,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_and_b32 s3, s2, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
@@ -1217,9 +1217,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_sext_i32_i8 s3, s2
 ; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80010
@@ -1323,9 +1323,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
 ; GFX12-NEXT:    s_and_b32 s4, s2, 0xff
@@ -1432,9 +1432,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_ashr_i32 s3, s2, 24
 ; GFX12-NEXT:    s_sext_i32_i8 s4, s2
@@ -1584,9 +1584,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
 ; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
@@ -1748,9 +1748,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
 ; GFX12-NEXT:    s_ashr_i32 s6, s2, 24
@@ -2000,9 +2000,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s7
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s6
 ; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s5
@@ -2276,9 +2276,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s7
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s6
 ; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s5
@@ -2725,9 +2725,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s11
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s10
 ; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s9
@@ -3232,9 +3232,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s11
 ; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s9
 ; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s8
@@ -4087,9 +4087,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s15
 ; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s1
@@ -5052,9 +5052,9 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[16:19], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s15
 ; GFX12-NEXT:    s_ashr_i32 s49, s15, 24
 ; GFX12-NEXT:    s_bfe_i32 s50, s15, 0x80010
@@ -5268,9 +5268,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -5354,9 +5354,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -5438,9 +5438,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5523,9 +5523,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -5629,9 +5629,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 8, v0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -5740,9 +5740,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v4, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -5871,9 +5871,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s2
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80010
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -6022,9 +6022,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX12-NEXT:    s_lshr_b32 s6, s2, 24
@@ -6228,9 +6228,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x80010
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
@@ -6481,9 +6481,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v6, 8, s2
 ; GFX12-NEXT:    v_lshrrev_b16 v7, 8, s3
 ; GFX12-NEXT:    s_lshr_b32 s6, s3, 16
@@ -6834,9 +6834,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_bfe_u32 s2, s7, 0x80010
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -7283,9 +7283,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s7
 ; GFX12-NEXT:    v_lshrrev_b16 v11, 8, s6
 ; GFX12-NEXT:    v_lshrrev_b16 v21, 8, s5
@@ -7942,9 +7942,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_bfe_u32 s10, s7, 0x80010
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
@@ -8801,9 +8801,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s7
 ; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s6
 ; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s5
@@ -9010,9 +9010,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9098,9 +9098,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9184,9 +9184,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9272,9 +9272,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9366,9 +9366,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xff, v1
 ; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9479,9 +9479,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 8
 ; GFX12-NEXT:    v_ashrrev_i16 v1, 8, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9608,9 +9608,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX12-NEXT:    v_and_b32_e64 v0, 0xff, s2
 ; GFX12-NEXT:    v_and_b32_e64 v1, 0xff, s3
@@ -9758,9 +9758,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x80000
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX12-NEXT:    v_ashrrev_i16 v0, 8, s2
@@ -9949,9 +9949,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX12-NEXT:    s_lshr_b32 s6, s3, 16
 ; GFX12-NEXT:    v_and_b32_e64 v0, 0xff, s2
@@ -10180,9 +10180,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x80000
 ; GFX12-NEXT:    s_bfe_i32 s9, s3, 0x80000
 ; GFX12-NEXT:    s_lshr_b32 s6, s2, 16
@@ -10510,9 +10510,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s3, s6, 16
 ; GFX12-NEXT:    s_lshr_b32 s9, s7, 16
 ; GFX12-NEXT:    s_lshr_b32 s11, s4, 16
@@ -10921,9 +10921,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX12-NEXT:    v_ashrrev_i16 v5, 8, s6
 ; GFX12-NEXT:    s_bfe_i32 s6, s6, 0x80000
@@ -11532,9 +11532,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s25, s1, 16
 ; GFX12-NEXT:    s_lshr_b32 s21, s3, 16
 ; GFX12-NEXT:    s_lshr_b32 s23, s0, 16
@@ -12306,9 +12306,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX12-NEXT:    s_lshr_b32 s14, s2, 16
 ; GFX12-NEXT:    v_ashrrev_i16 v4, 8, s2
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index fb3c04235b8e4d..0dbce3b6b5fe6f 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -5,12 +5,12 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
 ; GCN-LABEL: copy_flat:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x34
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
 ; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
 ; GCN-NEXT:    .p2align 6
 ; GCN-NEXT:  .LBB0_2: ; %for.body
@@ -24,7 +24,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_store_b128 v[4:5], v[0:3]
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  .LBB0_3: ; %for.end
@@ -52,13 +52,13 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
 ; GCN-LABEL: copy_global:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x34
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_3
 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
 ; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
 ; GCN-NEXT:  .LBB1_2: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -67,7 +67,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
 ; GCN-NEXT:    s_add_co_i32 s4, s4, -1
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
@@ -98,7 +98,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-LABEL: copy_constant:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x34
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
@@ -106,13 +106,13 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:  .LBB2_2: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
 ; GCN-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
 ; GCN-NEXT:    s_add_co_i32 s4, s4, -1
 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
 ; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
 ; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
@@ -145,7 +145,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
 ; GCN-LABEL: copy_local:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
 ; GCN-NEXT:  .LBB3_1: ; %for.body
@@ -158,9 +158,9 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
 ; GCN-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
 ; GCN-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
 ; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(1)
+; GCN-NEXT:    s_wait_dscnt 0x1
 ; GCN-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
-; GCN-NEXT:    s_waitcnt lgkmcnt(1)
+; GCN-NEXT:    s_wait_dscnt 0x1
 ; GCN-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GCN-NEXT:  .LBB3_2: ; %for.end
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 047cb3ab400084..b6263fc314f92b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -188,9 +188,9 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT
-; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-WGP-NEXT:    s_nop 0
 ; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -202,9 +202,9 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT_HT
-; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-CU-NEXT:    s_nop 0
 ; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -398,10 +398,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT
-; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-WGP-NEXT:    s_nop 0
 ; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -413,10 +413,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT
-; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-CU-NEXT:    s_nop 0
 ; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -596,21 +596,21 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ; GFX12-WGP-LABEL: private_nontemporal_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
 ; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
 ; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_WB
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
@@ -792,23 +792,23 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX12-WGP-LABEL: private_nontemporal_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
 ; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB
+; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
 ; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB
+; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 4b1fb295adec2a..0fa4ea8acfbb3e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -132,9 +132,9 @@ define amdgpu_kernel void @private_volatile_load_0(
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT
-; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-WGP-NEXT:    s_nop 0
 ; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -146,9 +146,9 @@ define amdgpu_kernel void @private_volatile_load_0(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_RT_NT
-; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-CU-NEXT:    s_nop 0
 ; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -288,10 +288,10 @@ define amdgpu_kernel void @private_volatile_load_1(
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT
-; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-WGP-NEXT:    s_nop 0
 ; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -303,10 +303,10 @@ define amdgpu_kernel void @private_volatile_load_1(
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT
-; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-CU-NEXT:    s_nop 0
 ; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -441,23 +441,23 @@ define amdgpu_kernel void @private_volatile_store_0(
 ; GFX12-WGP-LABEL: private_volatile_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
 ; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT
-; GFX12-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_volatile_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
 ; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT_RT
-; GFX12-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
@@ -592,25 +592,25 @@ define amdgpu_kernel void @private_volatile_store_1(
 ; GFX12-WGP-LABEL: private_volatile_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
 ; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT
-; GFX12-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_volatile_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
 ; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT
-; GFX12-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index e2617fc453b58f..8cc85dbd6cf11c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -114,13 +114,13 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
@@ -276,7 +276,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_clause 0x1
@@ -284,7 +284,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], null offset:16
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX12-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v5
@@ -403,7 +403,7 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s6
 ; GFX12-NEXT:    s_mov_b32 s6, -1
@@ -556,14 +556,14 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_mov_b32 s15, s11
 ; GFX12-NEXT:    s_mov_b32 s2, s10
 ; GFX12-NEXT:    s_mov_b32 s3, s11
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s12, s6
 ; GFX12-NEXT:    s_mov_b32 s13, s7
 ; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null
 ; GFX12-NEXT:    buffer_load_b32 v1, off, s[0:3], null
 ; GFX12-NEXT:    s_mov_b32 s8, s4
 ; GFX12-NEXT:    s_mov_b32 s9, s5
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-NEXT:    s_nop 0
@@ -677,7 +677,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-LABEL: mul64_sext_c:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
@@ -786,7 +786,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
 ; GFX12-NEXT:    s_mov_b32 s3, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -922,13 +922,13 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_hi_i32 v1, 0x50, v0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
@@ -1069,13 +1069,13 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_hi_u32 v1, 0x50, v0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
@@ -1213,13 +1213,13 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_hi_i32 v1, 9, v0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, 9, v0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
@@ -1329,7 +1329,7 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x4c
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x70
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
@@ -1455,13 +1455,13 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], null
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-NEXT:    s_nop 0
@@ -1573,7 +1573,7 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x4c
 ; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x70
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u16 v0, s2, s3
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s2, -1
@@ -1730,7 +1730,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s10, s6
 ; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s8, s2
 ; GFX12-NEXT:    s_mov_b32 s9, s3
 ; GFX12-NEXT:    s_clause 0x1
@@ -1738,7 +1738,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX12-NEXT:    buffer_load_u8 v1, off, s[8:11], null offset:4
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -1898,7 +1898,7 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mul_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -2080,14 +2080,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ; GFX12-NEXT:    s_mov_b32 s3, s11
 ; GFX12-NEXT:    s_mov_b32 s14, s10
 ; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s12, s6
 ; GFX12-NEXT:    s_mov_b32 s13, s7
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], null
 ; GFX12-NEXT:    s_mov_b32 s8, s4
 ; GFX12-NEXT:    s_mov_b32 s9, s5
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
@@ -2313,7 +2313,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
 ; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX12-NEXT:  ; %bb.1: ; %else
@@ -2329,7 +2329,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:  ; %bb.4: ; %if
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s6, -1
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s4, s2
 ; GFX12-NEXT:    s_mov_b32 s5, s3
 ; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
@@ -2337,10 +2337,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:  .LBB15_5:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX12-NEXT:  .LBB15_6: ; %endif
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s2, -1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2580,7 +2580,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-LABEL: mul64_in_branch:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB16_3
 ; GFX12-NEXT:  ; %bb.1: ; %else
@@ -2603,7 +2603,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:  .LBB16_5: ; %endif
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s2, -1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2926,7 +2926,7 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ; GFX12-NEXT:    s_mov_b32 s17, s3
 ; GFX12-NEXT:    s_mov_b32 s19, s3
 ; GFX12-NEXT:    s_mov_b32 s24, s3
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s2, s4
 ; GFX12-NEXT:    s_mov_b32 s14, s8
 ; GFX12-NEXT:    s_mov_b32 s12, s9
@@ -3209,11 +3209,11 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x2c
 ; GFX12-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_load_b128 v[0:3], v15, s[0:1]
 ; GFX12-NEXT:    global_load_b128 v[4:7], v15, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
 ; GFX12-NEXT:    v_mul_lo_u32 v14, v5, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v3, v4, v3
@@ -3347,7 +3347,11 @@ define i32 @mul_pow2_plus_1(i32 %val) {
 ;
 ; GFX12-LABEL: mul_pow2_plus_1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 5a8814e7b20fa3..6e27d3f51d6980 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -37,9 +37,13 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 1
   %load = load i8, ptr %gep, align 4
@@ -72,9 +76,13 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load i8, ptr %gep, align 4
@@ -107,9 +115,13 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
@@ -146,9 +158,13 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
@@ -203,9 +219,13 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_24bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8388607
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
@@ -260,9 +280,13 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -2048
   %load = load i8, ptr %gep, align 4
@@ -299,9 +323,13 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
@@ -338,9 +366,13 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
@@ -377,9 +409,13 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_neg_24bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388608
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8388608
   %load = load i8, ptr %gep, align 4
@@ -413,9 +449,13 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
@@ -452,9 +492,13 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
@@ -509,9 +553,13 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
@@ -566,11 +614,15 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8388606
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
@@ -593,11 +645,15 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 16777214
   %load = load i8, ptr %gep, align 4
@@ -634,9 +690,13 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
@@ -673,9 +733,13 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
@@ -712,9 +776,13 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ;
 ; GFX12-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -16384
   %load = load i8, ptr %gep, align 4
@@ -751,20 +819,28 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388607
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -16777215
   %load = load i8, ptr %gep, align 4
@@ -802,11 +878,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
@@ -851,7 +931,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -860,7 +944,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589936639
   %load = load i8, ptr %gep, align 4
@@ -898,11 +982,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
@@ -947,7 +1035,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -956,7 +1048,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589936640
   %load = load i8, ptr %gep, align 4
@@ -994,11 +1086,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
@@ -1043,7 +1139,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1052,7 +1152,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589938687
   %load = load i8, ptr %gep, align 4
@@ -1090,11 +1190,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
@@ -1139,7 +1243,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1148,7 +1256,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589938688
   %load = load i8, ptr %gep, align 4
@@ -1186,11 +1294,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
@@ -1235,7 +1347,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1244,7 +1360,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589942783
   %load = load i8, ptr %gep, align 4
@@ -1282,11 +1398,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
@@ -1331,7 +1451,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1340,7 +1464,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589942784
   %load = load i8, ptr %gep, align 4
@@ -1379,11 +1503,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1428,7 +1556,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1437,7 +1569,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
   %load = load i8, ptr %gep, align 4
@@ -1476,11 +1608,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1525,7 +1661,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1534,7 +1674,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
   %load = load i8, ptr %gep, align 4
@@ -1573,11 +1713,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1622,7 +1766,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1631,7 +1779,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
   %load = load i8, ptr %gep, align 4
@@ -1670,11 +1818,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1719,7 +1871,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1728,7 +1884,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
   %load = load i8, ptr %gep, align 4
@@ -1767,11 +1923,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -1816,7 +1976,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1825,7 +1989,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
   %load = load i8, ptr %gep, align 4
@@ -1864,11 +2028,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ;
 ; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -1913,7 +2081,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1922,7 +2094,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
   %load = load i8, ptr %gep, align 4
@@ -1967,10 +2139,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_1:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 1
@@ -2017,10 +2189,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 2047
@@ -2067,10 +2239,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 4095
@@ -2121,10 +2293,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2201,10 +2373,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2281,10 +2453,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2361,10 +2533,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2437,10 +2609,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 4095
@@ -2491,10 +2663,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2571,10 +2743,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2651,10 +2823,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2731,10 +2903,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2811,10 +2983,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -2891,12 +3063,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -2929,13 +3101,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589936639
@@ -2986,12 +3158,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3024,13 +3196,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589936640
@@ -3081,12 +3253,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3119,13 +3291,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589938687
@@ -3177,12 +3349,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3215,13 +3387,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589938688
@@ -3273,12 +3445,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3311,13 +3483,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589942783
@@ -3369,12 +3541,12 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3407,13 +3579,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8589942784
@@ -3467,13 +3639,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3506,13 +3678,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
@@ -3566,13 +3738,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3605,13 +3777,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
@@ -3665,13 +3837,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3704,13 +3876,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
@@ -3764,13 +3936,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3803,13 +3975,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
@@ -3863,13 +4035,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -3902,13 +4074,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
@@ -3962,13 +4134,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -4001,13 +4173,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 88be7e54ced107..97dd288004edf6 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -35,9 +35,13 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -68,9 +72,13 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -103,9 +111,13 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
@@ -151,9 +163,13 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
@@ -217,9 +233,13 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_24bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8388607
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max:
@@ -277,9 +297,13 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -312,9 +336,13 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -351,9 +379,13 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -390,9 +422,13 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_neg_24bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388608
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -425,9 +461,13 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
@@ -473,9 +513,13 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
@@ -539,9 +583,13 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:16383
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
@@ -605,11 +653,15 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
@@ -641,11 +693,15 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8388606
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -678,9 +734,13 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -717,9 +777,13 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -756,9 +820,13 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-16384
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -795,11 +863,15 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
@@ -831,11 +903,15 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388607
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -887,7 +963,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -896,7 +976,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
@@ -928,11 +1008,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -983,7 +1067,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -992,7 +1080,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
@@ -1024,11 +1112,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1079,7 +1171,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1088,7 +1184,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
@@ -1120,11 +1216,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1175,7 +1275,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1184,7 +1288,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
@@ -1216,11 +1320,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1271,7 +1379,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1280,7 +1392,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
@@ -1312,11 +1424,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1367,7 +1483,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1376,7 +1496,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
@@ -1408,11 +1528,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1463,7 +1587,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1472,7 +1600,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1505,11 +1633,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386561
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1560,7 +1692,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1569,7 +1705,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1602,11 +1738,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386560
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1657,7 +1797,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1666,7 +1810,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1699,11 +1843,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384513
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1754,7 +1902,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1763,7 +1915,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1796,11 +1948,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384512
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1851,7 +2007,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1860,7 +2020,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -1893,11 +2053,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380417
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1948,7 +2112,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1957,7 +2125,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -1990,11 +2158,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ;
 ; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380416
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -2038,9 +2210,9 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2088,9 +2260,9 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2138,9 +2310,9 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2188,9 +2360,9 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2238,9 +2410,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2291,9 +2463,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2360,9 +2532,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2435,9 +2607,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2485,9 +2657,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2535,9 +2707,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2588,9 +2760,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2657,9 +2829,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2740,9 +2912,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2823,13 +2995,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2863,12 +3035,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2924,13 +3096,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2964,12 +3136,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3025,13 +3197,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3065,12 +3237,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3126,13 +3298,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3166,12 +3338,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3227,13 +3399,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3267,12 +3439,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3328,13 +3500,13 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 2
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3368,12 +3540,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192 th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3428,11 +3600,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x7ff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3444,10 +3616,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x7ff
 ; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3502,11 +3674,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3518,10 +3690,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x800
 ; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3576,11 +3748,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xfff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3592,10 +3764,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3650,11 +3822,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3666,10 +3838,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3724,11 +3896,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x1fff
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3740,10 +3912,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x1fff
 ; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3798,11 +3970,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x2000
 ; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0x80000000
-; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3814,10 +3986,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x2000
 ; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] th:TH_LOAD_RT_NT
-; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-SDAG-NEXT:    s_nop 0
 ; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index 17b3fdc04ec934..c272e8e9788ea0 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -21,9 +21,9 @@ declare i64 @llvm.readcyclecounter() #0
 ; GFX12:       s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
 ; GFX12:       s_cmp_eq_u32 [[HI1]], [[HI2]]
 ; GFX12:       s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
-; GCN-DAG:     lgkmcnt
+; GCN-DAG:     kmcnt
 ; MEMTIME:     store_dwordx2
-; SIVI-NOT:    lgkmcnt
+; SIVI-NOT:    kmcnt
 ; MEMTIME:     s_memtime s{{\[[0-9]+:[0-9]+\]}}
 ; MEMTIME:     store_dwordx2
 
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 4695cadd45aeed..65c75363f43466 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -44,7 +44,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GFX12-LABEL: s_sub_i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_sub_co_i32 s2, s2, s3
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -96,7 +96,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
 ; GFX12-LABEL: s_sub_imm_i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_sub_co_i32 s2, 0x4d2, s2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -157,9 +157,9 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -221,9 +221,9 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0x7b, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -286,9 +286,9 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v3
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v2
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
@@ -367,11 +367,11 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
 ; GFX12-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v2, v6, v2
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
@@ -449,11 +449,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3] th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u16 v0, v1, v0
 ; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -530,9 +530,9 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v1
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -620,9 +620,9 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_sub_i16 v1, v1, v3
 ; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v2
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
@@ -686,7 +686,7 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x2c
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_sub_nc_u64 s[2:3], s[4:5], s[6:7]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
@@ -767,11 +767,11 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
 ; GFX12-NEXT:    global_load_b64 v[2:3], v2, s[0:1]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
@@ -862,11 +862,11 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[6:7]
 ; GFX12-NEXT:    global_load_b128 v[4:7], v4, s[0:1]
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v6
 ; GFX12-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
@@ -991,16 +991,16 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v12, 5, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    global_load_b128 v[0:3], v12, s[6:7]
 ; GFX12-NEXT:    global_load_b128 v[4:7], v12, s[0:1]
 ; GFX12-NEXT:    global_load_b128 v[8:11], v12, s[6:7] offset:16
 ; GFX12-NEXT:    global_load_b128 v[12:15], v12, s[0:1] offset:16
-; GFX12-NEXT:    s_waitcnt vmcnt(2)
+; GFX12-NEXT:    s_wait_loadcnt 0x2
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v6
 ; GFX12-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_co_u32 v10, vcc_lo, v10, v14
 ; GFX12-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo
 ; GFX12-NEXT:    v_sub_co_u32 v8, vcc_lo, v8, v12
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir
index c06e931c65d5e4..ab20333cee9d0d 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir
@@ -19,7 +19,7 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX12-NEXT: GLOBAL_INV 16, implicit $exec
-    ; GFX12-NEXT: S_WAITCNT 1015
+    ; GFX12-NEXT: S_WAIT_LOADCNT 0
     ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
     renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, killed $vgpr0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
     GLOBAL_INV 16, implicit $exec


