[llvm] e4ebfb5 - [MCA] Adding an AMDGPUCustomBehaviour implementation.
Patrick Holland via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 24 13:34:34 PDT 2021
Author: Patrick Holland
Date: 2021-08-24T13:33:58-07:00
New Revision: e4ebfb5786a1fbd48c67c5e1fe7b295241fe7010
URL: https://github.com/llvm/llvm-project/commit/e4ebfb5786a1fbd48c67c5e1fe7b295241fe7010
DIFF: https://github.com/llvm/llvm-project/commit/e4ebfb5786a1fbd48c67c5e1fe7b295241fe7010.diff
LOG: [MCA] Adding an AMDGPUCustomBehaviour implementation.
This implementation allows llvm-mca to model the desired behaviour of the s_waitcnt
instruction. This patch also adds the RetireOOO flag to the AMDGPU instructions
within the scheduling model. This flag is only used by llvm-mca and allows
instructions to retire out-of-order, which helps llvm-mca's simulations model
the actual device more closely.
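The new gfx9-retireooo.s test exercises both changes; its RUN line is simply

    llvm-mca -mtriple=amdgcn -mcpu=gfx900 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s

so the same invocation (minus the FileCheck pipe, with any gfx9 assembly file on
stdin) can be used to inspect the modelled s_waitcnt stalls in the timeline view.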
Differential Revision: https://reviews.llvm.org/D104730
Added:
llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
Modified:
llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
llvm/lib/Target/AMDGPU/SISchedule.td
llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 3643b777db59..220f99f7f436 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -21,17 +21,312 @@
namespace llvm {
namespace mca {
+void AMDGPUInstrPostProcess::postProcessInstruction(
+ std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
+ switch (MCI.getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx6_gfx7:
+ case AMDGPU::S_WAITCNT_vi:
+ return processWaitCnt(Inst, MCI);
+ }
+}
+
+// s_waitcnt instructions encode important information as immediate operands
+// which are lost during the MCInst -> mca::Instruction lowering.
+void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) {
+ for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
+ MCAOperand Op;
+ const MCOperand &MCOp = MCI.getOperand(Idx);
+ if (MCOp.isReg()) {
+ Op = MCAOperand::createReg(MCOp.getReg());
+ } else if (MCOp.isImm()) {
+ Op = MCAOperand::createImm(MCOp.getImm());
+ }
+ Op.setIndex(Idx);
+ Inst->addOperand(Op);
+ }
+}
+
AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
const mca::SourceMgr &SrcMgr,
const MCInstrInfo &MCII)
- : CustomBehaviour(STI, SrcMgr, MCII) {}
+ : CustomBehaviour(STI, SrcMgr, MCII) {
+ generateWaitCntInfo();
+}
+
+unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
+ const InstRef &IR) {
+ const Instruction &Inst = *IR.getInstruction();
+ unsigned Opcode = Inst.getOpcode();
+
+ // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
+ // pseudo instructions here. However, there are plans for the future to make
+ // it possible to use mca within backend passes. As such, I have left the
+ // pseudo version of s_waitcnt within this switch statement.
+ switch (Opcode) {
+ default:
+ return 0;
+ case AMDGPU::S_WAITCNT: // This instruction
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx6_gfx7:
+ case AMDGPU::S_WAITCNT_vi:
+ // s_endpgm also behaves as if there is an implicit
+ // s_waitcnt 0, but I'm not sure if it would be appropriate
+ // to model this in llvm-mca based on how the iterations work
+ // while simulating the pipeline over and over.
+ return handleWaitCnt(IssuedInst, IR);
+ }
-unsigned
-AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<mca::InstRef> IssuedInst,
- const mca::InstRef &IR) {
return 0;
}
+unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
+ const InstRef &IR) {
+ // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
+ // I do not know how that instruction works so I did not attempt to model it.
+  // Start each counter at its maximum value.
+ unsigned Vmcnt = 63;
+ unsigned Expcnt = 7;
+ unsigned Lgkmcnt = 31;
+ unsigned Vscnt = 63;
+ unsigned CurrVmcnt = 0;
+ unsigned CurrExpcnt = 0;
+ unsigned CurrLgkmcnt = 0;
+ unsigned CurrVscnt = 0;
+ unsigned CyclesToWaitVm = ~0U;
+ unsigned CyclesToWaitExp = ~0U;
+ unsigned CyclesToWaitLgkm = ~0U;
+ unsigned CyclesToWaitVs = ~0U;
+
+ computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
+
+ // We will now look at each of the currently executing instructions
+ // to find out if this wait instruction still needs to wait.
+ for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
+ const InstRef &PrevIR = *I;
+ const Instruction &PrevInst = *PrevIR.getInstruction();
+ const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
+ const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
+ const int CyclesLeft = PrevInst.getCyclesLeft();
+ assert(CyclesLeft != UNKNOWN_CYCLES &&
+ "We should know how many cycles are left for this instruction");
+ if (PrevInstWaitInfo.VmCnt) {
+ CurrVmcnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitVm)
+ CyclesToWaitVm = CyclesLeft;
+ }
+ if (PrevInstWaitInfo.ExpCnt) {
+ CurrExpcnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitExp)
+ CyclesToWaitExp = CyclesLeft;
+ }
+ if (PrevInstWaitInfo.LgkmCnt) {
+ CurrLgkmcnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
+ CyclesToWaitLgkm = CyclesLeft;
+ }
+ if (PrevInstWaitInfo.VsCnt) {
+ CurrVscnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitVs)
+ CyclesToWaitVs = CyclesLeft;
+ }
+ }
+
+ unsigned CyclesToWait = ~0U;
+ if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
+ CyclesToWait = CyclesToWaitVm;
+ if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
+ CyclesToWait = CyclesToWaitExp;
+ if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
+ CyclesToWait = CyclesToWaitLgkm;
+ if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
+ CyclesToWait = CyclesToWaitVs;
+
+ // We may underestimate how many cycles we need to wait, but this
+ // isn't a big deal. Our return value is just how many cycles until
+ // this function gets run again. So as long as we don't overestimate
+ // the wait time, we'll still end up stalling at this instruction
+ // for the correct number of cycles.
+
+ if (CyclesToWait == ~0U)
+ return 0;
+ return CyclesToWait;
+}
+
+void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
+ unsigned &Expcnt, unsigned &Lgkmcnt,
+ unsigned &Vscnt) {
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
+ const Instruction &Inst = *IR.getInstruction();
+ unsigned Opcode = Inst.getOpcode();
+
+ switch (Opcode) {
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
+ // Should probably be checking for nullptr
+ // here, but I'm not sure how I should handle the case
+ // where we see a nullptr.
+ const MCAOperand *OpReg = Inst.getOperand(0);
+ const MCAOperand *OpImm = Inst.getOperand(1);
+ assert(OpReg && OpReg->isReg() && "First operand should be a register.");
+ assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
+ if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
+ // Instruction is using a real register.
+ // Since we can't know what value this register will have,
+ // we can't compute what the value of this wait should be.
+ WithColor::warning() << "The register component of "
+ << MCII.getName(Opcode) << " will be completely "
+ << "ignored. So the wait may not be accurate.\n";
+ }
+ switch (Opcode) {
+ // Redundant switch so I don't have to repeat the code above
+ // for each case. There are more clever ways to avoid this
+ // extra switch and anyone can feel free to implement one of them.
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ Expcnt = OpImm->getImm();
+ break;
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ Lgkmcnt = OpImm->getImm();
+ break;
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ Vmcnt = OpImm->getImm();
+ break;
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+ Vscnt = OpImm->getImm();
+ break;
+ }
+ return;
+ }
+ case AMDGPU::S_WAITCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx6_gfx7:
+ case AMDGPU::S_WAITCNT_vi:
+ unsigned WaitCnt = Inst.getOperand(0)->getImm();
+ AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
+ return;
+ }
+}
+
+void AMDGPUCustomBehaviour::generateWaitCntInfo() {
+ // The core logic from this function is taken from
+  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the instructions
+ // that are being looked at are in the MachineInstr format, whereas we have
+ // access to the MCInst format. The side effects of this are that we can't use
+ // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
+ // functions. Therefore, we conservatively assume that these functions will
+ // return true. This may cause a few instructions to be incorrectly tagged
+ // with an extra CNT. However, these are instructions that do interact with at
+ // least one CNT so giving them an extra CNT shouldn't cause issues in most
+ // scenarios.
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
+ InstrWaitCntInfo.resize(SrcMgr.size());
+
+ int Index = 0;
+ for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
+ const std::unique_ptr<Instruction> &Inst = *I;
+ unsigned Opcode = Inst->getOpcode();
+ const MCInstrDesc &MCID = MCII.get(Opcode);
+ if ((MCID.TSFlags & SIInstrFlags::DS) &&
+ (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
+ InstrWaitCntInfo[Index].ExpCnt = true;
+ } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
+ // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
+ // and mayAccessLDSThroughFlat(Inst) would both return true for this
+ // instruction. We have to do this because those functions use
+ // information about the memory operands that we don't have access to.
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ if (!STI.hasFeature(AMDGPU::FeatureVscnt))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else
+ InstrWaitCntInfo[Index].VsCnt = true;
+ } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
+ if (!STI.hasFeature(AMDGPU::FeatureVscnt))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else if ((MCID.mayLoad() &&
+ !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
+ ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
+ !MCID.mayStore()))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else if (MCID.mayStore())
+ InstrWaitCntInfo[Index].VsCnt = true;
+
+ // (IV.Major < 7) is meant to represent
+ // GCNTarget.vmemWriteNeedsExpWaitcnt()
+ // which is defined as
+ // { return getGeneration() < SEA_ISLANDS; }
+ if (IV.Major < 7 &&
+ (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
+ InstrWaitCntInfo[Index].ExpCnt = true;
+ } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ } else if (MCID.TSFlags & SIInstrFlags::EXP) {
+ InstrWaitCntInfo[Index].ExpCnt = true;
+ } else {
+ switch (Opcode) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT:
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ break;
+ }
+ }
+ }
+}
+
+// taken from SIInstrInfo::isVMEM()
+bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
+ return MCID.TSFlags & SIInstrFlags::MUBUF ||
+ MCID.TSFlags & SIInstrFlags::MTBUF ||
+ MCID.TSFlags & SIInstrFlags::MIMG;
+}
+
+// taken from SIInstrInfo::hasModifiersSet()
+bool AMDGPUCustomBehaviour::hasModifiersSet(
+ const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
+ int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
+ if (Idx == -1)
+ return false;
+
+ const MCAOperand *Op = Inst->getOperand(Idx);
+ if (Op == nullptr || !Op->isImm() || !Op->getImm())
+ return false;
+
+ return true;
+}
+
+// taken from SIInstrInfo::isAlwaysGDS()
+bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
+ return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
+ Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
+ Opcode == AMDGPU::DS_GWS_SEMA_P ||
+ Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
+ Opcode == AMDGPU::DS_GWS_BARRIER;
+}
+
} // namespace mca
} // namespace llvm
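To restate the hazard computation above outside of the mca types: handleWaitCnt
counts, per counter, how many in-flight instructions still hold that counter and
how soon the earliest of them finishes, then stalls only if a count exceeds the
limit requested by the s_waitcnt. A minimal standalone sketch of that bookkeeping
(using a hypothetical InFlight struct rather than mca::InstRef) looks roughly
like this:

    #include <algorithm>
    #include <climits>
    #include <vector>

    // Hypothetical stand-in for an executing instruction: which counters it is
    // still holding, and how many cycles remain until it completes.
    struct InFlight {
      bool VmCnt = false, ExpCnt = false, LgkmCnt = false, VsCnt = false;
      unsigned CyclesLeft = 0;
    };

    // Returns how many cycles the s_waitcnt must stall, or 0 if it can issue now.
    // Vmcnt/Expcnt/Lgkmcnt/Vscnt are the limits requested by the s_waitcnt
    // (left at their maxima when the instruction does not wait on that counter).
    unsigned stallCycles(const std::vector<InFlight> &Issued, unsigned Vmcnt,
                         unsigned Expcnt, unsigned Lgkmcnt, unsigned Vscnt) {
      unsigned CurrVm = 0, CurrExp = 0, CurrLgkm = 0, CurrVs = 0;
      unsigned WaitVm = UINT_MAX, WaitExp = UINT_MAX, WaitLgkm = UINT_MAX,
               WaitVs = UINT_MAX;
      for (const InFlight &I : Issued) {
        if (I.VmCnt)   { ++CurrVm;   WaitVm   = std::min(WaitVm, I.CyclesLeft); }
        if (I.ExpCnt)  { ++CurrExp;  WaitExp  = std::min(WaitExp, I.CyclesLeft); }
        if (I.LgkmCnt) { ++CurrLgkm; WaitLgkm = std::min(WaitLgkm, I.CyclesLeft); }
        if (I.VsCnt)   { ++CurrVs;   WaitVs   = std::min(WaitVs, I.CyclesLeft); }
      }
      // Stall only if more instructions hold a counter than the wait allows;
      // waiting until the earliest of them retires is enough, because the
      // hazard check runs again after that many cycles.
      unsigned Wait = UINT_MAX;
      if (CurrVm > Vmcnt)     Wait = std::min(Wait, WaitVm);
      if (CurrExp > Expcnt)   Wait = std::min(Wait, WaitExp);
      if (CurrLgkm > Lgkmcnt) Wait = std::min(Wait, WaitLgkm);
      if (CurrVs > Vscnt)     Wait = std::min(Wait, WaitVs);
      return Wait == UINT_MAX ? 0 : Wait;
    }

As the in-source comment notes, underestimating the wait here is harmless: the
return value only decides when checkCustomHazard is consulted again, so the
instruction keeps stalling until the counts actually drop below the requested
limits.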
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index daefad28699c..728c5455ff49 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -25,23 +25,68 @@ namespace llvm {
namespace mca {
class AMDGPUInstrPostProcess : public InstrPostProcess {
+ void processWaitCnt(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
public:
AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
: InstrPostProcess(STI, MCII) {}
~AMDGPUInstrPostProcess() {}
- void postProcessInstruction(std::unique_ptr<mca::Instruction> &Inst,
- const MCInst &MCI) override {}
+ void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) override;
+};
+
+struct WaitCntInfo {
+ bool VmCnt = false;
+ bool ExpCnt = false;
+ bool LgkmCnt = false;
+ bool VsCnt = false;
};
class AMDGPUCustomBehaviour : public CustomBehaviour {
+  /// Whenever MCA would like to dispatch an s_waitcnt instruction,
+  /// we must check all the instructions that are still executing to see if
+ /// they modify the same CNT as we need to wait for. This vector
+ /// gets built in the constructor and contains 1 WaitCntInfo struct
+ /// for each instruction within the SrcManager. Each element
+ /// tells us which CNTs that instruction may interact with.
+ /// We conservatively assume some instructions interact with more
+ /// CNTs than they do in reality, so we will occasionally wait
+ /// longer than necessary, but we shouldn't ever wait for shorter.
+ std::vector<WaitCntInfo> InstrWaitCntInfo;
+
+ /// This method gets called from the constructor and is
+  /// where we set up the InstrWaitCntInfo vector.
+ /// The core logic for determining which CNTs an instruction
+ /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter().
+  /// Unfortunately, some of the logic from that function is not available to us
+ /// in this scope so we conservatively end up assuming that some
+ /// instructions interact with more CNTs than they do in reality.
+ void generateWaitCntInfo();
+ /// Helper function used in generateWaitCntInfo()
+ bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
+ unsigned OpName) const;
+ /// Helper function used in generateWaitCntInfo()
+ bool isAlwaysGDS(uint16_t Opcode) const;
+ /// Helper function used in generateWaitCntInfo()
+ bool isVMEM(const MCInstrDesc &MCID);
+ /// This method gets called from checkCustomHazard when mca is attempting to
+ /// dispatch an s_waitcnt instruction (or one of its variants). The method
+ /// looks at each of the instructions that are still executing in the pipeline
+ /// to determine if the waitcnt should force a wait.
+ unsigned handleWaitCnt(ArrayRef<InstRef> IssuedInst, const InstRef &IR);
+ /// Based on the type of s_waitcnt instruction we are looking at, and what its
+ /// operands are, this method will set the values for each of the cnt
+ /// references provided as arguments.
+ void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt,
+ unsigned &Lgkmcnt, unsigned &Vscnt);
+
public:
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
~AMDGPUCustomBehaviour() {}
-
/// This method is used to determine if an instruction
/// should be allowed to be dispatched. The return value is
/// how many cycles until the instruction can be dispatched.
@@ -49,10 +94,9 @@ class AMDGPUCustomBehaviour : public CustomBehaviour {
/// register and hardware dependencies so this method should only
/// implement custom behaviour and dependencies that are not picked up
/// by MCA naturally.
- unsigned checkCustomHazard(ArrayRef<mca::InstRef> IssuedInst,
- const mca::InstRef &IR) override;
+ unsigned checkCustomHazard(ArrayRef<InstRef> IssuedInst,
+ const InstRef &IR) override;
};
-
} // namespace mca
} // namespace llvm
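For context on how these hooks are meant to be driven, here is a hedged sketch
that follows only the constructor and method signatures declared above; the real
llvm-mca wiring goes through its target-selection code and is not part of this
patch, and the function name below is hypothetical:

    #include "AMDGPUCustomBehaviour.h"
    #include <memory>
    #include <vector>

    namespace llvm {
    namespace mca {

    // Sketch only: assumes the MCSubtargetInfo, MCInstrInfo, SourceMgr and the
    // already-lowered mca::Instructions are available, as they are inside
    // llvm-mca's own pipeline setup.
    void wireUpAMDGPUBehaviour(const MCSubtargetInfo &STI,
                               const MCInstrInfo &MCII, const SourceMgr &SrcMgr,
                               std::vector<std::unique_ptr<Instruction>> &Lowered,
                               ArrayRef<MCInst> MCInsts) {
      // Re-attach the s_waitcnt operands that the generic
      // MCInst -> mca::Instruction lowering drops.
      AMDGPUInstrPostProcess IPP(STI, MCII);
      for (unsigned I = 0, E = Lowered.size(); I != E; ++I)
        IPP.postProcessInstruction(Lowered[I], MCInsts[I]);

      // The constructor builds the per-instruction WaitCntInfo table; the
      // dispatch stage later calls CB.checkCustomHazard(IssuedInst, IR) for
      // each candidate instruction and stalls for however many cycles it
      // returns.
      AMDGPUCustomBehaviour CB(STI, SrcMgr, MCII);
      (void)CB;
    }

    } // namespace mca
    } // namespace llvm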
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index b24c061af7ab..0792b303b830 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -137,6 +137,7 @@ def MIReadVGPR : SchedReadVariant<[
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {
+ let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<WriteBranch, [HWBranch], 8>;
def : HWWriteRes<WriteExport, [HWExport], 4>;
def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
@@ -159,6 +160,7 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write8PassMAI, [HWXDL], 8>;
let ResourceCycles = [16] in
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
+ } // End RetireOOO = 1
def : ReadAdvance<MIVGPRRead, -2>;
@@ -182,6 +184,7 @@ let SchedModel = SIFullSpeedModel in {
defm : SICommonWriteRes;
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteIntMul, 4>;
def : HWVALUWriteRes<WriteFloatFMA, 1>;
@@ -189,6 +192,7 @@ def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -198,6 +202,7 @@ let SchedModel = SIQuarterSpeedModel in {
defm : SICommonWriteRes;
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteIntMul, 4>;
def : HWVALUWriteRes<WriteFloatFMA, 16>;
@@ -205,6 +210,7 @@ def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 16>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
@@ -218,6 +224,7 @@ let SchedModel = SIDPFullSpeedModel in {
defm : SICommonWriteRes;
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
@@ -225,6 +232,7 @@ def : HWVALUWriteRes<WriteDoubleCvt, 1>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
@@ -240,6 +248,7 @@ let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
@@ -259,6 +268,7 @@ def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
index 0ffdad05cfa6..00b429ef6d67 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
@@ -41,12 +41,12 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 28
-# CHECK-NEXT: Total Cycles: 224
+# CHECK-NEXT: Total Cycles: 205
# CHECK-NEXT: Total uOps: 29
# CHECK: Dispatch Width: 1
-# CHECK-NEXT: uOps Per Cycle: 0.13
-# CHECK-NEXT: IPC: 0.13
+# CHECK-NEXT: uOps Per Cycle: 0.14
+# CHECK-NEXT: IPC: 0.14
# CHECK-NEXT: Block RThroughput: 29.0
# CHECK: Instruction Info:
@@ -133,37 +133,37 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: - - - 1.00 - 1.00 1.00 - v_sqrt_f64_e32 v[4:5], v[4:5]
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1]
-# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2
-# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5]
-# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6
-# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9]
-# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10
-# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1]
-# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3]
-# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5]
-# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1]
-# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3]
-# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5]
-# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7]
-# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3]
-# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5]
-# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7]
-# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9]
-# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0
-# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
-# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0
-# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
-# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
-# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1]
-# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3]
-# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5]
+# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1]
+# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2
+# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5]
+# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6
+# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9]
+# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10
+# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1]
+# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3]
+# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5]
+# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1]
+# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3]
+# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5]
+# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7]
+# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1]
+# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3]
+# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5]
+# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7]
+# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9]
+# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
+# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
+# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0
+# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
+# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0
+# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
+# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
+# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1]
+# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3]
+# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
new file mode 100644
index 000000000000..939d3b062013
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
@@ -0,0 +1,233 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx900 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
+
+s_load_dwordx2 s[2:3], s[0:1], 0x24
+s_load_dwordx2 s[0:1], s[0:1], 0x2c
+s_waitcnt lgkmcnt(0)
+v_mov_b32_e32 v0, s2
+v_mov_b32_e32 v1, s3
+flat_load_dword v2, v[0:1]
+flat_load_dword v3, v[0:1] offset:8
+flat_load_dword v4, v[0:1] offset:16
+flat_load_dword v5, v[0:1] offset:24
+v_mov_b32_e32 v0, s0
+v_mov_b32_e32 v1, s1
+v_mov_b32_e32 v6, s6
+v_mov_b32_e32 v7, s7
+v_mov_b32_e32 v8, s8
+v_mov_b32_e32 v9, s9
+v_mov_b32_e32 v10, s10
+v_mov_b32_e32 v11, s11
+v_mov_b32_e32 v12, s12
+v_mov_b32_e32 v13, s13
+v_mov_b32_e32 v14, s14
+v_mov_b32_e32 v15, s15
+v_mov_b32_e32 v16, s16
+v_mov_b32_e32 v17, s17
+v_mov_b32_e32 v18, s18
+v_mov_b32_e32 v19, s19
+v_mov_b32_e32 v20, s20
+v_mov_b32_e32 v21, s21
+v_mov_b32_e32 v22, s22
+v_mov_b32_e32 v23, s23
+v_mov_b32_e32 v24, s24
+v_mov_b32_e32 v25, s25
+v_mov_b32_e32 v26, s26
+v_mov_b32_e32 v27, s27
+v_mov_b32_e32 v28, s28
+v_mov_b32_e32 v29, s29
+s_waitcnt vmcnt(0) lgkmcnt(0)
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 36
+# CHECK-NEXT: Total Cycles: 331
+# CHECK-NEXT: Total uOps: 36
+
+# CHECK: Dispatch Width: 1
+# CHECK-NEXT: uOps Per Cycle: 0.11
+# CHECK-NEXT: IPC: 0.11
+# CHECK-NEXT: Block RThroughput: 36.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[2:3], s[0:1], 0x24
+# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[0:1], s[0:1], 0x2c
+# CHECK-NEXT: 1 1 1.00 U s_waitcnt lgkmcnt(0)
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s2
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s3
+# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v2, v[0:1]
+# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v3, v[0:1] offset:8
+# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v4, v[0:1] offset:16
+# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v5, v[0:1] offset:24
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s0
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s1
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v6, s6
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v7, s7
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v8, s8
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v9, s9
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v10, s10
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v11, s11
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v12, s12
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v13, s13
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v14, s14
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v15, s15
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v16, s16
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v17, s17
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v18, s18
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v19, s19
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v20, s20
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v21, s21
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v22, s22
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v23, s23
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v24, s24
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v25, s25
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v26, s26
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v27, s27
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v28, s28
+# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v29, s29
+# CHECK-NEXT: 1 1 1.00 U s_waitcnt vmcnt(0) lgkmcnt(0)
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - HWBranch
+# CHECK-NEXT: [1] - HWExport
+# CHECK-NEXT: [2] - HWLGKM
+# CHECK-NEXT: [3] - HWSALU
+# CHECK-NEXT: [4] - HWVALU
+# CHECK-NEXT: [5] - HWVMEM
+# CHECK-NEXT: [6] - HWXDL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
+# CHECK-NEXT: - - 2.00 2.00 28.00 4.00 -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[2:3], s[0:1], 0x24
+# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[0:1], s[0:1], 0x2c
+# CHECK-NEXT: - - - 1.00 - - - s_waitcnt lgkmcnt(0)
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s2
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s3
+# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v2, v[0:1]
+# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v3, v[0:1] offset:8
+# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v4, v[0:1] offset:16
+# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v5, v[0:1] offset:24
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s0
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s1
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v6, s6
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v7, s7
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v8, s8
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v9, s9
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v10, s10
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v11, s11
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v12, s12
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v13, s13
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v14, s14
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v15, s15
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v16, s16
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v17, s17
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v18, s18
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v19, s19
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v20, s20
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v21, s21
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v22, s22
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v23, s23
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v24, s24
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v25, s25
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v26, s26
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v27, s27
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v28, s28
+# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v29, s29
+# CHECK-NEXT: - - - 1.00 - - - s_waitcnt vmcnt(0) lgkmcnt(0)
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24
+# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c
+# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0)
+# CHECK-NEXT: [0,3] . . DE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2
+# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3
+# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . flat_load_dword v2, v[0:1]
+# CHECK-NEXT: [0,6] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . flat_load_dword v3, v[0:1] offset:8
+# CHECK-NEXT: [0,7] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . flat_load_dword v4, v[0:1] offset:16
+# CHECK-NEXT: [0,8] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24
+# CHECK-NEXT: [0,9] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0
+# CHECK-NEXT: [0,10] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1
+# CHECK-NEXT: [0,11] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6
+# CHECK-NEXT: [0,12] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . . . v_mov_b32_e32 v7, s7
+# CHECK-NEXT: [0,13] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8
+# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9
+# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10
+# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11
+# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12
+# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v13, s13
+# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14
+# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15
+# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16
+# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v17, s17
+# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v18, s18
+# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v19, s19
+# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20
+# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v21, s21
+# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v22, s22
+# CHECK-NEXT: [0,28] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v23, s23
+# CHECK-NEXT: [0,29] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v24, s24
+# CHECK-NEXT: [0,30] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25
+# CHECK-NEXT: [0,31] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26
+# CHECK-NEXT: [0,32] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v27, s27
+# CHECK-NEXT: [0,33] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . v_mov_b32_e32 v28, s28
+# CHECK-NEXT: [0,34] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . v_mov_b32_e32 v29, s29
+# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0)
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 0.0 0.0 0.0 s_load_dwordx2 s[2:3], s[0:1], 0x24
+# CHECK-NEXT: 1. 1 0.0 0.0 0.0 s_load_dwordx2 s[0:1], s[0:1], 0x2c
+# CHECK-NEXT: 2. 1 0.0 0.0 0.0 s_waitcnt lgkmcnt(0)
+# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s2
+# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s3
+# CHECK-NEXT: 5. 1 0.0 0.0 0.0 flat_load_dword v2, v[0:1]
+# CHECK-NEXT: 6. 1 0.0 0.0 0.0 flat_load_dword v3, v[0:1] offset:8
+# CHECK-NEXT: 7. 1 0.0 0.0 0.0 flat_load_dword v4, v[0:1] offset:16
+# CHECK-NEXT: 8. 1 0.0 0.0 0.0 flat_load_dword v5, v[0:1] offset:24
+# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s0
+# CHECK-NEXT: 10. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s1
+# CHECK-NEXT: 11. 1 0.0 0.0 0.0 v_mov_b32_e32 v6, s6
+# CHECK-NEXT: 12. 1 0.0 0.0 0.0 v_mov_b32_e32 v7, s7
+# CHECK-NEXT: 13. 1 0.0 0.0 0.0 v_mov_b32_e32 v8, s8
+# CHECK-NEXT: 14. 1 0.0 0.0 0.0 v_mov_b32_e32 v9, s9
+# CHECK-NEXT: 15. 1 0.0 0.0 0.0 v_mov_b32_e32 v10, s10
+# CHECK-NEXT: 16. 1 0.0 0.0 0.0 v_mov_b32_e32 v11, s11
+# CHECK-NEXT: 17. 1 0.0 0.0 0.0 v_mov_b32_e32 v12, s12
+# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_mov_b32_e32 v13, s13
+# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_mov_b32_e32 v14, s14
+# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_mov_b32_e32 v15, s15
+# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_mov_b32_e32 v16, s16
+# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_mov_b32_e32 v17, s17
+# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_mov_b32_e32 v18, s18
+# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_mov_b32_e32 v19, s19
+# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_mov_b32_e32 v20, s20
+# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_mov_b32_e32 v21, s21
+# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_mov_b32_e32 v22, s22
+# CHECK-NEXT: 28. 1 0.0 0.0 0.0 v_mov_b32_e32 v23, s23
+# CHECK-NEXT: 29. 1 0.0 0.0 0.0 v_mov_b32_e32 v24, s24
+# CHECK-NEXT: 30. 1 0.0 0.0 0.0 v_mov_b32_e32 v25, s25
+# CHECK-NEXT: 31. 1 0.0 0.0 0.0 v_mov_b32_e32 v26, s26
+# CHECK-NEXT: 32. 1 0.0 0.0 0.0 v_mov_b32_e32 v27, s27
+# CHECK-NEXT: 33. 1 0.0 0.0 0.0 v_mov_b32_e32 v28, s28
+# CHECK-NEXT: 34. 1 0.0 0.0 0.0 v_mov_b32_e32 v29, s29
+# CHECK-NEXT: 35. 1 0.0 0.0 0.0 s_waitcnt vmcnt(0) lgkmcnt(0)
+# CHECK-NEXT: 1 0.0 0.0 0.0 <total>