[llvm] 08b8d46 - [AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM load-stores (#145566)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 24 22:10:40 PDT 2025


Author: Christudasan Devadasan
Date: 2025-06-25T10:40:36+05:30
New Revision: 08b8d467d4253373e77a075c03e25281dee8ad15

URL: https://github.com/llvm/llvm-project/commit/08b8d467d4253373e77a075c03e25281dee8ad15
DIFF: https://github.com/llvm/llvm-project/commit/08b8d467d4253373e77a075c03e25281dee8ad15.diff

LOG: [AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM load-stores (#145566)

This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts a S_WAIT_XCNT
instruction with sufficient wait-count before potentially redefining
them. For VMEM instructions, XNACK is returned in the same order as
they were issued and hence non-zero counter values can be inserted.
However, SMEM execution is out-of-order and so is their XNACK reception.
Thus, only zero counter value can be inserted to capture SMEM dependencies.

Added: 
    llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
    llvm/test/CodeGen/AMDGPU/wait-xcnt.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9a7dd3c31e498..f43831016952a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -73,6 +73,7 @@ enum InstCounterType {
   SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
   BVH_CNT,                           // gfx12+ only.
   KM_CNT,                            // gfx12+ only.
+  X_CNT,                             // gfx1250.
   NUM_EXTENDED_INST_CNTS,
   NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
 };
@@ -102,6 +103,7 @@ struct HardwareLimits {
   unsigned SamplecntMax; // gfx12+ only.
   unsigned BvhcntMax;    // gfx12+ only.
   unsigned KmcntMax;     // gfx12+ only.
+  unsigned XcntMax;      // gfx1250.
 };
 
 #define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
@@ -111,10 +113,12 @@ struct HardwareLimits {
   DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
   DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
   DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
+  DECL(VMEM_GROUP)               /* vmem group */                              \
   DECL(LDS_ACCESS)               /* lds read & write */                        \
   DECL(GDS_ACCESS)               /* gds read & write */                        \
   DECL(SQ_MESSAGE)               /* send message */                            \
   DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
+  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
   DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
   DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
   DECL(EXP_POS_ACCESS)           /* write to export position */                \
@@ -178,7 +182,7 @@ enum VmemType {
 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
     AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
     AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
-    AMDGPU::S_WAIT_KMCNT};
+    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
 
 static bool updateVMCntOnly(const MachineInstr &Inst) {
   return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -223,6 +227,8 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
     return Wait.BvhCnt;
   case KM_CNT:
     return Wait.KmCnt;
+  case X_CNT:
+    return Wait.XCnt;
   default:
     llvm_unreachable("bad InstCounterType");
   }
@@ -283,12 +289,27 @@ class WaitcntBrackets {
       return Limits.BvhcntMax;
     case KM_CNT:
       return Limits.KmcntMax;
+    case X_CNT:
+      return Limits.XcntMax;
     default:
       break;
     }
     return 0;
   }
 
+  bool isSmemCounter(InstCounterType T) const {
+    return T == SmemAccessCounter || T == X_CNT;
+  }
+
+  unsigned getSgprScoresIdx(InstCounterType T) const {
+    if (T == SmemAccessCounter)
+      return 0;
+    if (T == X_CNT)
+      return 1;
+
+    llvm_unreachable("Invalid SMEM counter");
+  }
+
   unsigned getScoreLB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     return ScoreLBs[T];
@@ -307,8 +328,8 @@ class WaitcntBrackets {
     if (GprNo < NUM_ALL_VGPRS) {
       return VgprScores[T][GprNo];
     }
-    assert(T == SmemAccessCounter);
-    return SgprScores[GprNo - NUM_ALL_VGPRS];
+    assert(isSmemCounter(T));
+    return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
   }
 
   bool merge(const WaitcntBrackets &Other);
@@ -331,6 +352,7 @@ class WaitcntBrackets {
 
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
+  void applyXcnt(const AMDGPU::Waitcnt &Wait);
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
@@ -462,9 +484,11 @@ class WaitcntBrackets {
   int VgprUB = -1;
   int SgprUB = -1;
   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
-  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
-  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+  // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+  // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+  // X_CNT score.
+  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -572,6 +596,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         0,
         0,
+        0,
         0};
 
     return WaitEventMaskForInstPreGFX12;
@@ -607,7 +632,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         eventMask({VMEM_SAMPLER_READ_ACCESS}),
         eventMask({VMEM_BVH_READ_ACCESS}),
-        eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+        eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+        eventMask({VMEM_GROUP, SMEM_GROUP})};
 
     return WaitEventMaskForInstGFX12Plus;
   }
@@ -743,9 +769,12 @@ class SIInsertWaitcnts {
     return VmemReadMapping[getVmemType(Inst)];
   }
 
+  bool hasXcnt() const { return ST->hasWaitXCnt(); }
+
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+  bool isVmemAccess(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr,
@@ -837,9 +866,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
       VgprUB = std::max(VgprUB, RegNo);
       VgprScores[CntTy][RegNo] = Score;
     } else {
-      assert(CntTy == SmemAccessCounter);
+      assert(isSmemCounter(CntTy));
       SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
-      SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
+      SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
     }
   }
 }
@@ -976,6 +1005,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
           setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
       }
     }
+  } else if (T == X_CNT) {
+    for (const MachineOperand &Op : Inst.all_uses()) {
+      RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
+      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        setRegScore(RegNo, T, CurrScore);
+      }
+    }
   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
     // Match the score to the destination registers.
     //
@@ -1080,6 +1116,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
     case KM_CNT:
       OS << "    KM_CNT(" << SR << "): ";
       break;
+    case X_CNT:
+      OS << "    X_CNT(" << SR << "): ";
+      break;
     default:
       OS << "    UNKNOWN(" << SR << "): ";
       break;
@@ -1100,8 +1139,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
           OS << RelScore << ":ds ";
         }
       }
-      // Also need to print sgpr scores for lgkm_cnt.
-      if (T == SmemAccessCounter) {
+      // Also need to print sgpr scores for lgkm_cnt or xcnt.
+      if (isSmemCounter(T)) {
         for (int J = 0; J <= SgprUB; J++) {
           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
           if (RegScore <= LB)
@@ -1140,6 +1179,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
   simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
   simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+  simplifyWaitcnt(X_CNT, Wait.XCnt);
 }
 
 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1191,6 +1231,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
   applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   applyWaitcnt(BVH_CNT, Wait.BvhCnt);
   applyWaitcnt(KM_CNT, Wait.KmCnt);
+  applyXcnt(Wait);
 }
 
 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1207,11 +1248,29 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
   }
 }
 
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  // Wait on XCNT is redundant if we are already waiting for a load to complete.
+  // SMEM can return out of order, so only omit XCNT wait if we are waiting till
+  // zero.
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+    return applyWaitcnt(X_CNT, 0);
+
+  // If we have pending store we cannot optimize XCnt because we do not wait for
+  // stores. VMEM loads retun in order, so if we only have loads XCnt is
+  // decremented to the same number as LOADCnt.
+  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+      !hasPendingEvent(STORE_CNT))
+    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+
+  applyWaitcnt(X_CNT, Wait.XCnt);
+}
+
 // Where there are multiple types of event in the bracket of a counter,
 // the decrement may go out of order.
 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
   // Scalar memory read always can go out of order.
-  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+  if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
     return true;
   return hasMixedPendingEvents(T);
 }
@@ -1263,6 +1322,8 @@ static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
     return DS_CNT;
   case AMDGPU::S_WAIT_KMCNT:
     return KM_CNT;
+  case AMDGPU::S_WAIT_XCNT:
+    return X_CNT;
   default:
     return {};
   }
@@ -1427,7 +1488,8 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
 
 AMDGPU::Waitcnt
 WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
-  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
+                         ~0u /* XCNT */);
 }
 
 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1909,6 +1971,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
             ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
             ScoreBrackets.clearVgprVmemTypes(Interval);
           }
+
           if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
             ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
           }
@@ -1916,6 +1979,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         } else {
           ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
         }
+
+        if (hasXcnt() && Op.isDef())
+          ScoreBrackets.determineWait(X_CNT, Interval, Wait);
       }
     }
   }
@@ -1958,6 +2024,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     Wait.BvhCnt = 0;
   if (ForceEmitWaitcnt[KM_CNT])
     Wait.KmCnt = 0;
+  if (ForceEmitWaitcnt[X_CNT])
+    Wait.XCnt = 0;
 
   if (FlushVmCnt) {
     if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
@@ -2007,6 +2075,21 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                       << "Update Instr: " << *It);
   }
 
+  // XCnt may be already consumed by a load wait.
+  if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
+      !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+    Wait.XCnt = ~0u;
+
+  if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
+      !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+    Wait.XCnt = ~0u;
+
+  // Since the translation for VMEM addresses occurs in order, we can skip the
+  // XCnt if the current instruction is of VMEM type and has a memory dependency
+  // with another VMEM instruction in flight.
+  if (Wait.XCnt != ~0u && isVmemAccess(*It))
+    Wait.XCnt = ~0u;
+
   if (WCG->createNewWaitcnt(Block, It, Wait))
     Modified = true;
 
@@ -2096,6 +2179,11 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
   });
 }
 
+bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
+  return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+         (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+}
+
 static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
   auto Opc = Inst.getOpcode();
   return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
@@ -2167,6 +2255,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   // bracket and the destination operand scores.
   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
+  bool IsVMEMAccess = false;
+  bool IsSMEMAccess = false;
   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2189,6 +2279,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
 
     if (mayAccessVMEMThroughFlat(Inst)) {
       ++FlatASCount;
+      IsVMEMAccess = true;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                    Inst);
     }
@@ -2208,6 +2299,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->setPendingFlat();
   } else if (SIInstrInfo::isVMEM(Inst) &&
              !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+    IsVMEMAccess = true;
     ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                  Inst);
 
@@ -2216,6 +2308,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
+    IsSMEMAccess = true;
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
   } else if (Inst.isCall()) {
     if (callWaitsOnFunctionReturn(Inst)) {
@@ -2258,6 +2351,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       break;
     }
   }
+
+  if (!hasXcnt())
+    return;
+
+  if (IsVMEMAccess)
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
+
+  if (IsSMEMAccess)
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
 }
 
 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
@@ -2311,9 +2413,11 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
-    if (T == SmemAccessCounter) {
+    if (isSmemCounter(T)) {
+      unsigned Idx = getSgprScoresIdx(T);
       for (int J = 0; J <= SgprUB; J++)
-        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+        StrictDom |=
+            mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
     }
   }
 
@@ -2651,6 +2755,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
   Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
   Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
+  Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
 
   [[maybe_unused]] unsigned NumVGPRsMax =
       ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
@@ -2679,7 +2784,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
           .addImm(0);
       for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
           continue;
 
         if (!ST->hasImageInsts() &&

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0e5493259edb9..13549e5c4e58b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -129,6 +129,11 @@ unsigned getKmcntBitWidth(unsigned VersionMajor) {
   return VersionMajor >= 12 ? 5 : 0;
 }
 
+/// \returns Xcnt bit width.
+unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
+  return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
+}
+
 /// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
 unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
   return VersionMajor >= 12 ? 8 : 0;
@@ -1493,6 +1498,10 @@ unsigned getKmcntBitMask(const IsaVersion &Version) {
   return (1 << getKmcntBitWidth(Version.Major)) - 1;
 }
 
+unsigned getXcntBitMask(const IsaVersion &Version) {
+  return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
+}
+
 unsigned getStorecntBitMask(const IsaVersion &Version) {
   return (1 << getStorecntBitWidth(Version.Major)) - 1;
 }

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ac7c5100be3d4..e6078d6918ac2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -980,6 +980,7 @@ struct Waitcnt {
   unsigned SampleCnt = ~0u; // gfx12+ only.
   unsigned BvhCnt = ~0u;    // gfx12+ only.
   unsigned KmCnt = ~0u;     // gfx12+ only.
+  unsigned XCnt = ~0u;      // gfx1250.
 
   Waitcnt() = default;
   // Pre-gfx12 constructor.
@@ -988,15 +989,15 @@ struct Waitcnt {
 
   // gfx12+ constructor.
   Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
-          unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt)
+          unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt)
       : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
-        SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
+        SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {}
 
   bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
 
   bool hasWaitExceptStoreCnt() const {
     return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
-           SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u;
+           SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u;
   }
 
   bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
@@ -1008,7 +1009,7 @@ struct Waitcnt {
         std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
         std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
         std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
-        std::min(KmCnt, Other.KmCnt));
+        std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt));
   }
 };
 
@@ -1114,6 +1115,10 @@ unsigned getDscntBitMask(const IsaVersion &Version);
 /// Returns 0 for versions that do not support KMcnt
 unsigned getKmcntBitMask(const IsaVersion &Version);
 
+/// \returns Xcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support Xcnt.
+unsigned getXcntBitMask(const IsaVersion &Version);
+
 /// \return STOREcnt or VScnt bit mask for given isa \p Version.
 /// returns 0 for versions that do not support STOREcnt or VScnt.
 /// STOREcnt and VScnt are the same counter, the name used

diff  --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
new file mode 100644
index 0000000000000..70ea0688c8a49
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -0,0 +1,569 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Introduced additional operations in between the clauses to have the register dependency
+; between the operands of VMEM operations and the def ops of VALU instructions that followed.
+
+define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %ptr_c, ptr addrspace(1) %ptr_d, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_i8load_v4i8store:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT:    global_load_u8 v2, v[2:3], off
+; GCN-SDAG-NEXT:    global_load_u8 v3, v[4:5], off
+; GCN-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
+; GCN-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v2
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT:    v_lshlrev_b16 v2, 8, v3
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    global_store_b32 v[8:9], v0, off
+; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_i8load_v4i8store:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GCN-GISEL-NEXT:    global_load_u8 v1, v[2:3], off
+; GCN-GISEL-NEXT:    global_load_u8 v2, v[4:5], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GCN-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GCN-GISEL-NEXT:    global_store_b32 v[8:9], v0, off
+; GCN-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %a = load i8, ptr addrspace(1) %ptr_a
+  %b = load i8, ptr addrspace(1) %ptr_b
+  %c = load i8, ptr addrspace(1) %ptr_c
+  %d = load i8, ptr addrspace(1) %ptr_d
+  %ins_0 = insertelement <4 x i8> poison, i8 %a, i32 0
+  %ins_1 = insertelement <4 x i8> %ins_0, i8 %b, i32 1
+  %ins_2 = insertelement <4 x i8> %ins_1, i8 %c, i32 2
+  %ins_3 = insertelement <4 x i8> %ins_2, i8 %c, i32 3
+  store <4 x i8> %ins_3, ptr addrspace(1) %out
+  ret void
+}
+
+define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) {
+; GCN-SDAG-LABEL: test_v7i16_load_store:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[2:3], off
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    v_pk_add_u16 v10, v6, v2
+; GCN-SDAG-NEXT:    v_pk_add_u16 v11, v7, v3
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 12
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v6, 8
+; GCN-SDAG-NEXT:    v_pk_add_u16 v4, v4, v0
+; GCN-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-SDAG-NEXT:    v_pk_add_u16 v5, v5, v1
+; GCN-SDAG-NEXT:    s_clause 0x2
+; GCN-SDAG-NEXT:    global_store_b16 v[2:3], v11, off
+; GCN-SDAG-NEXT:    global_store_b32 v[6:7], v10, off
+; GCN-SDAG-NEXT:    global_store_b64 v[8:9], v[4:5], off
+; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_v7i16_load_store:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GCN-GISEL-NEXT:    global_load_b128 v[0:3], v[2:3], off
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 4
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    v_pk_add_u16 v2, v6, v2
+; GCN-GISEL-NEXT:    v_pk_add_u16 v4, v4, v0
+; GCN-GISEL-NEXT:    v_pk_add_u16 v1, v5, v1
+; GCN-GISEL-NEXT:    v_pk_add_u16 v3, v7, v3
+; GCN-GISEL-NEXT:    s_clause 0x6
+; GCN-GISEL-NEXT:    global_store_b16 v[8:9], v4, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v[10:11], v4, off
+; GCN-GISEL-NEXT:    global_store_b16 v[12:13], v1, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v[14:15], v1, off
+; GCN-GISEL-NEXT:    global_store_b16 v[16:17], v2, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v[18:19], v2, off
+; GCN-GISEL-NEXT:    global_store_b16 v[20:21], v3, off
+; GCN-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GCN-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1
+  %insert = insertelement <7 x i16> %vec1, i16 20, i32 4
+  %vec2 = load <7 x i16>, ptr addrspace(1) %ptr2
+  %add = add <7 x i16> %vec1, %vec2
+  store <7 x i16> %add, ptr addrspace(1) null
+  %elt = extractelement <7 x i16> %add, i32 5
+  ret i16 %elt
+}
+
+define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_v64i32_load_store:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT:    s_clause 0xd
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v40, s32 offset:52
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v41, s32 offset:48
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v42, s32 offset:44
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v43, s32 offset:40
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v44, s32 offset:36
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v45, s32 offset:32
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v56, s32 offset:28
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v57, s32 offset:24
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v58, s32 offset:20
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v59, s32 offset:16
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v60, s32 offset:12
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v61, s32 offset:8
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v62, s32 offset:4
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v63, s32
+; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:224
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:240
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT:    s_clause 0xd
+; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:192
+; GCN-SDAG-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:208
+; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:160
+; GCN-SDAG-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:176
+; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:128
+; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:144
+; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:96
+; GCN-SDAG-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:112
+; GCN-SDAG-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:64
+; GCN-SDAG-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:80
+; GCN-SDAG-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:32
+; GCN-SDAG-NEXT:    global_load_b128 v[56:59], v[0:1], off offset:48
+; GCN-SDAG-NEXT:    global_load_b128 v[60:63], v[0:1], off
+; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
+; GCN-SDAG-NEXT:    scratch_load_b128 v[6:9], off, s32 offset:56 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:224
+; GCN-SDAG-NEXT:    scratch_load_b128 v[6:9], off, s32 offset:72 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    s_clause 0xe
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:240
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:192
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:208
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:160
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[22:25], off offset:176
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:128
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:144
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:96
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[48:51], off offset:112
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[52:55], off offset:64
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[38:41], off offset:80
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[42:45], off offset:32
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[56:59], off offset:48
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[60:63], off
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
+; GCN-SDAG-NEXT:    s_clause 0xd
+; GCN-SDAG-NEXT:    scratch_load_b32 v63, off, s32
+; GCN-SDAG-NEXT:    scratch_load_b32 v62, off, s32 offset:4
+; GCN-SDAG-NEXT:    scratch_load_b32 v61, off, s32 offset:8
+; GCN-SDAG-NEXT:    scratch_load_b32 v60, off, s32 offset:12
+; GCN-SDAG-NEXT:    scratch_load_b32 v59, off, s32 offset:16
+; GCN-SDAG-NEXT:    scratch_load_b32 v58, off, s32 offset:20
+; GCN-SDAG-NEXT:    scratch_load_b32 v57, off, s32 offset:24
+; GCN-SDAG-NEXT:    scratch_load_b32 v56, off, s32 offset:28
+; GCN-SDAG-NEXT:    scratch_load_b32 v45, off, s32 offset:32
+; GCN-SDAG-NEXT:    scratch_load_b32 v44, off, s32 offset:36
+; GCN-SDAG-NEXT:    scratch_load_b32 v43, off, s32 offset:40
+; GCN-SDAG-NEXT:    scratch_load_b32 v42, off, s32 offset:44
+; GCN-SDAG-NEXT:    scratch_load_b32 v41, off, s32 offset:48
+; GCN-SDAG-NEXT:    scratch_load_b32 v40, off, s32 offset:52
+; GCN-SDAG-NEXT:    s_wait_xcnt 0xe
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_v64i32_load_store:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT:    s_clause 0xf
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v40, s32 offset:60
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v41, s32 offset:56
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v42, s32 offset:52
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v43, s32 offset:48
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v44, s32 offset:44
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v45, s32 offset:40
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v46, s32 offset:36
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v47, s32 offset:32
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v56, s32 offset:28
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v57, s32 offset:24
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v58, s32 offset:20
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v59, s32 offset:16
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v60, s32 offset:12
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v61, s32 offset:8
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v62, s32 offset:4
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v63, s32
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x8
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
+; GCN-GISEL-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:32
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT:    s_clause 0xe
+; GCN-GISEL-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:48
+; GCN-GISEL-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:64
+; GCN-GISEL-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:80
+; GCN-GISEL-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:96
+; GCN-GISEL-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:112
+; GCN-GISEL-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:128
+; GCN-GISEL-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:144
+; GCN-GISEL-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:160
+; GCN-GISEL-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:176
+; GCN-GISEL-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:192
+; GCN-GISEL-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:208
+; GCN-GISEL-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:224
+; GCN-GISEL-NEXT:    global_load_b128 v[56:59], v[0:1], off
+; GCN-GISEL-NEXT:    global_load_b128 v[60:63], v[0:1], off offset:16
+; GCN-GISEL-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:240
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    s_clause 0xe
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[0:3], off offset:32
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[6:9], off offset:48
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[10:13], off offset:64
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[14:17], off offset:80
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[18:21], off offset:96
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[22:25], off offset:112
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[26:29], off offset:128
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[30:33], off offset:144
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[34:37], off offset:160
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[48:51], off offset:176
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[52:55], off offset:192
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[38:41], off offset:208
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[42:45], off offset:224
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[56:59], off
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[60:63], off offset:16
+; GCN-GISEL-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[0:3], off offset:240
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, v62
+; GCN-GISEL-NEXT:    s_clause 0xf
+; GCN-GISEL-NEXT:    scratch_load_b32 v63, off, s32
+; GCN-GISEL-NEXT:    scratch_load_b32 v62, off, s32 offset:4
+; GCN-GISEL-NEXT:    scratch_load_b32 v61, off, s32 offset:8
+; GCN-GISEL-NEXT:    scratch_load_b32 v60, off, s32 offset:12
+; GCN-GISEL-NEXT:    scratch_load_b32 v59, off, s32 offset:16
+; GCN-GISEL-NEXT:    scratch_load_b32 v58, off, s32 offset:20
+; GCN-GISEL-NEXT:    scratch_load_b32 v57, off, s32 offset:24
+; GCN-GISEL-NEXT:    scratch_load_b32 v56, off, s32 offset:28
+; GCN-GISEL-NEXT:    scratch_load_b32 v47, off, s32 offset:32
+; GCN-GISEL-NEXT:    scratch_load_b32 v46, off, s32 offset:36
+; GCN-GISEL-NEXT:    scratch_load_b32 v45, off, s32 offset:40
+; GCN-GISEL-NEXT:    scratch_load_b32 v44, off, s32 offset:44
+; GCN-GISEL-NEXT:    scratch_load_b32 v43, off, s32 offset:48
+; GCN-GISEL-NEXT:    scratch_load_b32 v42, off, s32 offset:52
+; GCN-GISEL-NEXT:    scratch_load_b32 v41, off, s32 offset:56
+; GCN-GISEL-NEXT:    scratch_load_b32 v40, off, s32 offset:60
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %vec = load <64 x i32>, ptr addrspace(1) %ptr
+  store <64 x i32> %vec, ptr addrspace(1) %out, align 4
+  %elt = extractelement <64 x i32> %vec, i32 6
+  ret i32 %elt
+}
+
+define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_v16i64_load_store:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT:    s_clause 0x3
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v40, s32 offset:12
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v41, s32 offset:8
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v42, s32 offset:4
+; GCN-SDAG-NEXT:    scratch_store_b32 off, v43, s32
+; GCN-SDAG-NEXT:    s_clause 0x7
+; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:112
+; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:96
+; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:80
+; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:48
+; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:32
+; GCN-SDAG-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:16
+; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off
+; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:64
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v16, 0x70
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v50, 0x60
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v51, 0 :: v_dual_mov_b32 v52, 48
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v38, 0x50 :: v_dual_mov_b32 v53, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v54, 32
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v48, 64
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v55, 0 :: v_dual_mov_b32 v40, 16
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v49, 0
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v43, 0
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x7
+; GCN-SDAG-NEXT:    global_store_b128 v[16:17], v[6:9], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x6
+; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[10:13], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x5
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v16, v20 :: v_dual_mov_b32 v17, v21
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, v[6:7]
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x4
+; GCN-SDAG-NEXT:    global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x3
+; GCN-SDAG-NEXT:    global_store_b128 v[54:55], v[30:33], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT:    global_store_b128 v[40:41], v[22:25], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT:    global_store_b128 v[42:43], v[26:29], off
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x3
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[52:53], v[2:3], 0, v[2:3]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[50:51], v[0:1], 0, v[0:1]
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[36:37], v[36:37], 0, v[36:37]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[34:35], v[34:35], 0, v[34:35]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[32:33], v[32:33], 0, 0x64
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[20:21], v[20:21], 0, v[20:21]
+; GCN-SDAG-NEXT:    v_lshl_add_u64 v[18:19], v[18:19], 0, 0xc8
+; GCN-SDAG-NEXT:    s_clause 0x1
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[14:17], off
+; GCN-SDAG-NEXT:    global_store_b128 v[48:49], v[0:3], off
+; GCN-SDAG-NEXT:    s_clause 0x7
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:96
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[50:53], off offset:64
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:80
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:32
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:48
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[22:25], off offset:16
+; GCN-SDAG-NEXT:    s_clause 0x3
+; GCN-SDAG-NEXT:    scratch_load_b32 v43, off, s32
+; GCN-SDAG-NEXT:    scratch_load_b32 v42, off, s32 offset:4
+; GCN-SDAG-NEXT:    scratch_load_b32 v41, off, s32 offset:8
+; GCN-SDAG-NEXT:    scratch_load_b32 v40, off, s32 offset:12
+; GCN-SDAG-NEXT:    s_wait_xcnt 0xc
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v0, v28 :: v_dual_mov_b32 v1, v29
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_v16i64_load_store:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT:    s_clause 0x5
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v40, s32 offset:20
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v41, s32 offset:16
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v42, s32 offset:12
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v43, s32 offset:8
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v44, s32 offset:4
+; GCN-GISEL-NEXT:    scratch_store_b32 off, v45, s32
+; GCN-GISEL-NEXT:    s_clause 0x7
+; GCN-GISEL-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:80
+; GCN-GISEL-NEXT:    global_load_b128 v[10:13], v[0:1], off
+; GCN-GISEL-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:16
+; GCN-GISEL-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:32
+; GCN-GISEL-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:48
+; GCN-GISEL-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:96
+; GCN-GISEL-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:112
+; GCN-GISEL-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:64
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v34, 0xc8
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v35, 0 :: v_dual_mov_b32 v38, 0
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v48, 16
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v49, 0 :: v_dual_mov_b32 v50, 32
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v52, 48 :: v_dual_mov_b32 v51, 0
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v53, 0 :: v_dual_mov_b32 v54, 64
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v40, 0x50 :: v_dual_mov_b32 v55, 0
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 0x60
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v44, 0x70 :: v_dual_mov_b32 v43, 0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v45, 0
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x7
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v36, v8
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9]
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x6
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[10:13], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x5
+; GCN-GISEL-NEXT:    global_store_b128 v[48:49], v[14:17], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x4
+; GCN-GISEL-NEXT:    global_store_b128 v[50:51], v[18:21], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x3
+; GCN-GISEL-NEXT:    global_store_b128 v[52:53], v[22:25], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x2
+; GCN-GISEL-NEXT:    global_store_b128 v[42:43], v[26:29], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT:    global_store_b128 v[44:45], v[30:33], off
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x5
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13]
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x4
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[14:15]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[16:17], v[16:17], 0, v[16:17]
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x3
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[18:19], v[18:19], 0, v[18:19]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[20:21], v[20:21], 0, 0x64
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x2
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25]
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[48:49], v[0:1], 0, v[0:1]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[50:51], v[2:3], 0, v[2:3]
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x1
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29]
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x0
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31]
+; GCN-GISEL-NEXT:    v_lshl_add_u64 v[32:33], v[32:33], 0, v[32:33]
+; GCN-GISEL-NEXT:    s_clause 0x1
+; GCN-GISEL-NEXT:    global_store_b128 v[54:55], v[0:3], off
+; GCN-GISEL-NEXT:    global_store_b128 v[40:41], v[34:37], off
+; GCN-GISEL-NEXT:    s_clause 0x7
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[10:13], off
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:16
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:32
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[22:25], off offset:48
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[48:51], off offset:64
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:80
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:96
+; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:112
+; GCN-GISEL-NEXT:    s_clause 0x5
+; GCN-GISEL-NEXT:    scratch_load_b32 v45, off, s32
+; GCN-GISEL-NEXT:    scratch_load_b32 v44, off, s32 offset:4
+; GCN-GISEL-NEXT:    scratch_load_b32 v43, off, s32 offset:8
+; GCN-GISEL-NEXT:    scratch_load_b32 v42, off, s32 offset:12
+; GCN-GISEL-NEXT:    scratch_load_b32 v41, off, s32 offset:16
+; GCN-GISEL-NEXT:    scratch_load_b32 v40, off, s32 offset:20
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
+  %in_a = insertelement <16 x i64> %a, i64 100, i32 5
+  store <16 x i64> %in_a, ptr addrspace(1) null
+  %b = load <16 x i64>, ptr addrspace(1) %ptr_b, align 4
+  %in_b = insertelement <16 x i64> %a, i64 200, i32 10
+  store <16 x i64> %in_b, ptr addrspace(1) null
+  %add = add <16 x i64> %in_a, %in_b
+  store <16 x i64> %add, ptr addrspace(1) %out, align 4
+  %elt = extractelement <16 x i64> %add, i32 1
+  ret i64 %elt
+}
+
+define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_v7i16_load_store_kernel:
+; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GCN-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v8, 12
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 8
+; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
+; GCN-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT:    s_clause 0x1
+; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v4, s[0:1]
+; GCN-SDAG-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    v_pk_add_u16 v3, v3, v7
+; GCN-SDAG-NEXT:    v_pk_add_u16 v2, v2, v6
+; GCN-SDAG-NEXT:    v_pk_add_u16 v1, v1, v5
+; GCN-SDAG-NEXT:    v_pk_add_u16 v0, v0, v4
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-SDAG-NEXT:    s_clause 0x2
+; GCN-SDAG-NEXT:    global_store_b16 v[8:9], v3, off
+; GCN-SDAG-NEXT:    global_store_b32 v[10:11], v2, off
+; GCN-SDAG-NEXT:    global_store_b64 v[12:13], v[0:1], off
+; GCN-SDAG-NEXT:    global_store_d16_hi_b16 v4, v2, s[4:5]
+; GCN-SDAG-NEXT:    s_endpgm
+;
+; GCN-GISEL-LABEL: test_v7i16_load_store_kernel:
+; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GCN-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2
+; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x0
+; GCN-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v12, 4
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT:    s_clause 0x1
+; GCN-GISEL-NEXT:    global_load_b128 v[0:3], v4, s[0:1]
+; GCN-GISEL-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    v_pk_add_u16 v0, v0, v4
+; GCN-GISEL-NEXT:    v_pk_add_u16 v1, v1, v5
+; GCN-GISEL-NEXT:    v_pk_add_u16 v2, v2, v6
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-GISEL-NEXT:    v_pk_add_u16 v3, v3, v7
+; GCN-GISEL-NEXT:    s_clause 0x6
+; GCN-GISEL-NEXT:    global_store_b16 v[8:9], v0, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v[10:11], v0, off
+; GCN-GISEL-NEXT:    global_store_b16 v[12:13], v1, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v[14:15], v1, off
+; GCN-GISEL-NEXT:    global_store_b16 v[16:17], v2, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v[18:19], v2, off
+; GCN-GISEL-NEXT:    global_store_b16 v[20:21], v3, off
+; GCN-GISEL-NEXT:    global_store_d16_hi_b16 v4, v2, s[4:5]
+; GCN-GISEL-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr inbounds <7 x i16>, ptr addrspace(1) %ptr1, i32 %tid
+  %gep2 = getelementptr inbounds <7 x i16>, ptr addrspace(1) %ptr2, i32 %tid
+  %vec1 = load <7 x i16>, ptr addrspace(1) %gep1
+  %insert = insertelement <7 x i16> %vec1, i16 20, i32 4
+  %vec2 = load <7 x i16>, ptr addrspace(1) %gep2
+  %add = add <7 x i16> %vec1, %vec2
+  store <7 x i16> %add, ptr addrspace(1) null
+  %elt = extractelement <7 x i16> %add, i32 5
+  store i16 %elt, ptr addrspace(1) %out
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
new file mode 100644
index 0000000000000..73b994ab2ab8c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -0,0 +1,922 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: vmem_scratch_load
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: vmem_scratch_load
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_buffer_load_dword_offset
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-LABEL: name: vmem_buffer_load_dword_offset
+    ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr2, killed $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_buffer_load_addr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-LABEL: name: vmem_buffer_load_addr
+    ; GCN: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+    $vgpr2 = BUFFER_LOAD_DWORD_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_flat_load
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; GCN-LABEL: name: vmem_flat_load
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_global_load
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; GCN-LABEL: name: vmem_global_load
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec:: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1)
+    $vgpr3 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = nofpexcept V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
+...
+
+---
+name: vmem_global_store
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GCN-LABEL: name: vmem_global_store
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr3, implicit $exec
+    GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr3, implicit $exec
+...
+
+---
+name: vmem_buffer_store
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg:  $sgpr32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; GCN-LABEL: name: vmem_buffer_store
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_WAIT_LOADCNT_DSCNT 0
+    ; GCN-NEXT: S_WAIT_KMCNT 0
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $vgpr0 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec
+    $vgpr0 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+...
+
+---
+name: vmem_scratch_store
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GCN-LABEL: name: vmem_scratch_store
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_WAIT_LOADCNT_DSCNT 0
+    ; GCN-NEXT: S_WAIT_KMCNT 0
+    ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+    SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr2, implicit $exec
+...
+
+---
+name: smem_load
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $sgpr2_sgpr3
+    ; GCN-LABEL: name: smem_load
+    ; GCN: liveins: $sgpr2_sgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4)
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4)
+     $sgpr2 = S_MOV_B32 0
+...
+
+---
+name: smem_store
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+     liveins: $sgpr0, $sgpr2, $sgpr3
+    ; GCN-LABEL: name: smem_store
+    ; GCN: liveins: $sgpr0, $sgpr2, $sgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr0, $sgpr2_sgpr3, 0, 0
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+     S_STORE_DWORD_IMM $sgpr0, $sgpr2_sgpr3, 0, 0
+     $sgpr3 = S_MOV_B32 0
+...
+
+# 4 global_load instructions together form a load-group.
+
+---
+name: vmem_load_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-LABEL: name: vmem_load_group
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 2
+    ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+# The contiguous stores form a single group.
+
+---
+name: vmem_store_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-LABEL: name: vmem_store_group
+    ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+...
+
+---
+name: smem_load_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; GCN-LABEL: name: smem_load_group
+    ; GCN: liveins: $sgpr0_sgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+    ; GCN-NEXT: $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+    ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+    ; GCN-NEXT: S_WAIT_KMCNT 0
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+     $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+     $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+     $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+     $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+     $sgpr2 = S_MOV_B32 0
+...
+
+---
+name: smem_store_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5
+    ; GCN-LABEL: name: smem_store_group
+    ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr2, $sgpr0_sgpr1, 0, 0
+    ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr3, $sgpr0_sgpr1, 0, 0
+    ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr0_sgpr1, 0, 0
+    ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr0_sgpr1, 0, 0
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+    ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+     S_STORE_DWORD_IMM $sgpr2, $sgpr0_sgpr1, 0, 0
+     S_STORE_DWORD_IMM $sgpr3, $sgpr0_sgpr1, 0, 0
+     S_STORE_DWORD_IMM $sgpr4, $sgpr0_sgpr1, 0, 0
+     S_STORE_DWORD_IMM $sgpr5, $sgpr0_sgpr1, 0, 0
+     $sgpr2 = S_MOV_B32 0
+     $sgpr3 = S_MOV_B32 0
+...
+
+# The four global_load instructions form two separate groups due to the intervening s_nop.
+
+---
+name: vmem_loads_with_an_intervening_nop
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-LABEL: name: vmem_loads_with_an_intervening_nop
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    ; GCN-NEXT: S_NOP 0
+    ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 2
+    ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    S_NOP 0
+    $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+---
+name: vmem_contiguous_loads_with_an_intervening_store
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-LABEL: name: vmem_contiguous_loads_with_an_intervening_store
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 1
+    ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 32, 0, implicit $exec
+    ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 2
+    ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 32, 0, implicit $exec
+    $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+---
+name: vmem_stores_with_intervening_nop
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-LABEL: name: vmem_stores_with_intervening_nop
+    ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: S_NOP 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    S_NOP 0
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+...
+
+# The intervening load breaks the store group and forms two distinct store groups.
+
+---
+name: vmem_contiguous_stores_with_an_intervening_load
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-LABEL: name: vmem_contiguous_stores_with_an_intervening_load
+    ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr11 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    $vgpr11 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    $vgpr10 = V_LSHLREV_B32_e64 16, $vgpr6, implicit $exec
+...
+
+# Atomic operations should not form a group. But they are memory instructions and should increment
+# the xcnt counter value as they might cause register dependency. This test ensures S_WAIT_XCNT
+# insertion for such cases.
+
+---
+name: atomic_op
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-LABEL: name: atomic_op
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
+    ; GCN-NEXT: GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1)
+    ; GCN-NEXT: $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: S_WAIT_XCNT 2
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 1
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1)
+    GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1)
+    $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+...
+
+# Force insert S_WAIT_XCNT 0 for dependency in SMEM instruction even though
+# there is a pending VMEM dependency.
+
+---
+name: smem_xcnt_insertion_with_pending_vmem_event
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; GCN-LABEL: name: smem_xcnt_insertion_with_pending_vmem_event
+    ; GCN: liveins: $sgpr0_sgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+    ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 4, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_KMCNT 0
+    ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+    $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+    $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 4, implicit $exec
+    GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
+    $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
+    $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+...
+
+# The second instruction in the flat_load group has a WAR dependency with a prior
+# memory operation (scratch_load instruction).
+
+---
+name: vmem_group_reg_dependency_with_prior_instruction
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr4, $vgpr5
+    ; GCN-LABEL: name: vmem_group_reg_dependency_with_prior_instruction
+    ; GCN: liveins: $vgpr4, $vgpr5
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+...
+
+# Two instructions inside the load group have dependencies with prior instructions.
+
+---
+name: multiple_xcnt_insertion_in_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr3, $vgpr4, $vgpr5
+    ; GCN-LABEL: name: multiple_xcnt_insertion_in_group
+    ; GCN: liveins: $vgpr3, $vgpr4, $vgpr5
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr3, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    $vgpr2 = SCRATCH_LOAD_DWORD $vgpr3, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (s32), addrspace 1)
+    $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+...
+
+---
+name: xcnt_event_post_load_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-LABEL: name: xcnt_event_post_load_group
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    ; GCN-NEXT: $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    ; GCN-NEXT: $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 3
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 2
+    ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 1
+    ; GCN-NEXT: $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 8, 0, implicit $exec
+    $vgpr6_vgpr7 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 16, 0, implicit $exec
+    $vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 24, 0, implicit $exec
+    GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2 = V_LSHLREV_B32_e64 16, $vgpr4, implicit $exec
+...
+
+# The three V_MOV_B32 instructions waiting outside the group need appropriate wait_xcnt
+# insertion as their dst registers have dependencies with instructions inside the group.
+
+---
+name: xcnt_event_post_store_group
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-LABEL: name: xcnt_event_post_store_group
+    ; GCN: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 8
+    ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 6
+    ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 4
+    ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 2, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr11 = V_LSHLREV_B32_e64 16, $vgpr10, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr7, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr8, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr9, 0, 0, implicit $exec
+    $vgpr11 = SCRATCH_LOAD_DWORD $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
+    GLOBAL_STORE_DWORD killed $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 2, implicit $exec
+    $vgpr11 = V_LSHLREV_B32_e64 16, $vgpr10, implicit $exec
+...
+
+# This test captures the case that interleaving load store operations form separate groups.
+# The registers in V_MOV_B32 all have dependencies with these independent groups and
+# should have the wait_xcnt insertion with appropriate wait values.
+
+---
+name: load_store_switching
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    ; GCN-LABEL: name: load_store_switching
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD $vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr7 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, $vgpr4, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr8 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD killed $vgpr2_vgpr3, $vgpr5, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 1
+    ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 2
+    ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 2, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 0
+    ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 3, implicit $exec
+    $vgpr0 = SCRATCH_LOAD_DWORD $vgpr5, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr1 = V_LSHLREV_B32_e64 16, $vgpr1, implicit $exec
+    $vgpr7 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr2_vgpr3, $vgpr4, 0, 0, implicit $exec
+    $vgpr8 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec
+    GLOBAL_STORE_DWORD killed $vgpr2_vgpr3, $vgpr5, 0, 0, implicit $exec
+    $vgpr7 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 2, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 3, implicit $exec
+...
+
+# V_DUAL_MOV is a single instruction and should emit required xcnt
+# if the destination registers have any memory-op dependency.
+
+---
+name: dual_mov
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+     liveins: $sgpr0, $sgpr1, $vgpr1
+    ; GCN-LABEL: name: dual_mov
+    ; GCN: liveins: $sgpr0, $sgpr1, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: S_WAIT_LOADCNT 0
+    ; GCN-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+     $vgpr2 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+     $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+...
+
+# No xcnt wait insertion for DS load/store operations.
+
+---
+name: ds_load_store
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; GCN-LABEL: name: ds_load_store
+    ; GCN: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; GCN-NEXT: $vgpr0 = DS_READ_B32_gfx9 killed $vgpr1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) undef`, addrspace 3)
+    ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+    ; GCN-NEXT: S_WAIT_DSCNT 0
+    ; GCN-NEXT: DS_WRITE_B32_gfx9 killed $vgpr0, killed $vgpr1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) undef`, addrspace 3)
+    ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+     $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+     $vgpr0 = DS_READ_B32_gfx9 killed $vgpr1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(3)* undef`)
+     $vgpr1 = V_MOV_B32_e32 2, implicit $exec
+     DS_WRITE_B32_gfx9 killed $vgpr0, killed $vgpr1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(3)* undef`)
+     $vgpr0 = V_MOV_B32_e32 20, implicit $exec
+...
+
+---
+name: xcnt_max
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; GCN-LABEL: name: xcnt_max
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: S_WAIT_XCNT 62
+    ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr3, 0, 0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+...


        


More information about the llvm-commits mailing list