[llvm] 292da93 - [MCA] Disable RCU for InOrderIssueStage

Andrew Savonichev via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 24 03:55:52 PDT 2021


Author: Andrew Savonichev
Date: 2021-03-24T13:54:04+03:00
New Revision: 292da93d59a3688ffc95c10de7986472242e8f1d

URL: https://github.com/llvm/llvm-project/commit/292da93d59a3688ffc95c10de7986472242e8f1d
DIFF: https://github.com/llvm/llvm-project/commit/292da93d59a3688ffc95c10de7986472242e8f1d.diff

LOG: [MCA] Disable RCU for InOrderIssueStage

This is a follow-up for:
D98604 [MCA] Ensure that writes occur in-order

When instruction issue is constrained so that writes occur in order (D98604),
instructions also retire in order naturally. A RetireControlUnit (RCU) is
therefore unnecessary for the in-order pipeline, so it is disabled.

Differential Revision: https://reviews.llvm.org/D98628

Added: 
    

Modified: 
    llvm/docs/CommandGuide/llvm-mca.rst
    llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
    llvm/include/llvm/MCA/Stages/RetireStage.h
    llvm/lib/MCA/Context.cpp
    llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
    llvm/lib/MCA/Stages/InOrderIssueStage.cpp
    llvm/lib/MCA/Stages/RetireStage.cpp
    llvm/lib/Target/AArch64/AArch64SchedA55.td
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s
    llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s
    llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
    llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s
    llvm/tools/llvm-mca/Views/TimelineView.cpp
    llvm/tools/llvm-mca/llvm-mca.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst
index 1229fd934c80..9e40e5d9e4f5 100644
--- a/llvm/docs/CommandGuide/llvm-mca.rst
+++ b/llvm/docs/CommandGuide/llvm-mca.rst
@@ -975,7 +975,6 @@ met. Multiple instructions can be issued in one cycle according to the value of
 the ``IssueWidth`` parameter in LLVM's scheduling model.
 
 Once issued, an instruction is moved to ``IssuedInst`` set until it is ready to
-retire. If ``RetireControlUnit`` is defined in the LLVM's scheduling model,
-:program:`llvm-mca` ensures that instructions are retired in-order. However, an
-instruction is allowed to retire out-of-order if ``RetireOOO`` property is true
-for at least one of its writes.
+retire. :program:`llvm-mca` ensures that writes are committed in-order. However,
+an instruction is allowed to commit writes and retire out-of-order if
+the ``RetireOOO`` property is true for at least one of its writes.

diff  --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
index 867a6c1df3c5..e3aec7fb78ca 100644
--- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -27,12 +27,10 @@ class MCSubtargetInfo;
 namespace mca {
 class RegisterFile;
 class ResourceManager;
-struct RetireControlUnit;
 
 class InOrderIssueStage final : public Stage {
   const MCSchedModel &SM;
   const MCSubtargetInfo &STI;
-  RetireControlUnit &RCU;
   RegisterFile &PRF;
   std::unique_ptr<ResourceManager> RM;
 
@@ -67,14 +65,16 @@ class InOrderIssueStage final : public Stage {
   Error tryIssue(InstRef &IR, unsigned *StallCycles);
 
   /// Update status of instructions from IssuedInst.
-  Error updateIssuedInst();
+  void updateIssuedInst();
+
+  /// Retire instruction once it is executed.
+  void retireInstruction(InstRef &IR);
 
 public:
-  InOrderIssueStage(RetireControlUnit &RCU, RegisterFile &PRF,
-                    const MCSchedModel &SM, const MCSubtargetInfo &STI)
-      : SM(SM), STI(STI), RCU(RCU), PRF(PRF),
-        RM(std::make_unique<ResourceManager>(SM)), NumIssued(0),
-        StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {}
+  InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM,
+                    const MCSubtargetInfo &STI)
+      : SM(SM), STI(STI), PRF(PRF), RM(std::make_unique<ResourceManager>(SM)),
+        NumIssued(0), StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {}
 
   bool isAvailable(const InstRef &) const override;
   bool hasWorkToComplete() const override;

diff  --git a/llvm/include/llvm/MCA/Stages/RetireStage.h b/llvm/include/llvm/MCA/Stages/RetireStage.h
index 27fb9c31d7cd..b635a01db85e 100644
--- a/llvm/include/llvm/MCA/Stages/RetireStage.h
+++ b/llvm/include/llvm/MCA/Stages/RetireStage.h
@@ -30,7 +30,6 @@ class RetireStage final : public Stage {
   RetireControlUnit &RCU;
   RegisterFile &PRF;
   LSUnitBase &LSU;
-  SmallVector<InstRef, 4> RetireInst;
 
   RetireStage(const RetireStage &Other) = delete;
   RetireStage &operator=(const RetireStage &Other) = delete;
@@ -39,9 +38,7 @@ class RetireStage final : public Stage {
   RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS)
       : Stage(), RCU(R), PRF(F), LSU(LS) {}
 
-  bool hasWorkToComplete() const override {
-    return !RCU.isEmpty() || !RetireInst.empty();
-  }
+  bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
   Error cycleStart() override;
   Error cycleEnd() override;
   Error execute(InstRef &IR) override;

diff  --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp
index 250ebebefe7a..8f5addbe6715 100644
--- a/llvm/lib/MCA/Context.cpp
+++ b/llvm/lib/MCA/Context.cpp
@@ -71,23 +71,16 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
 std::unique_ptr<Pipeline>
 Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
   const MCSchedModel &SM = STI.getSchedModel();
-  auto RCU = std::make_unique<RetireControlUnit>(SM);
   auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
-  auto LSU = std::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
-                                      Opts.StoreQueueSize, Opts.AssumeNoAlias);
 
   auto Entry = std::make_unique<EntryStage>(SrcMgr);
-  auto InOrderIssue = std::make_unique<InOrderIssueStage>(*RCU, *PRF, SM, STI);
-  auto Retire = std::make_unique<RetireStage>(*RCU, *PRF, *LSU);
+  auto InOrderIssue = std::make_unique<InOrderIssueStage>(*PRF, SM, STI);
 
   auto StagePipeline = std::make_unique<Pipeline>();
   StagePipeline->appendStage(std::move(Entry));
   StagePipeline->appendStage(std::move(InOrderIssue));
-  StagePipeline->appendStage(std::move(Retire));
 
-  addHardwareUnit(std::move(RCU));
   addHardwareUnit(std::move(PRF));
-  addHardwareUnit(std::move(LSU));
 
   return StagePipeline;
 }

diff  --git a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
index 812109f26684..9297f0c4fd7b 100644
--- a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
+++ b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
@@ -23,6 +23,8 @@ RetireControlUnit::RetireControlUnit(const MCSchedModel &SM)
     : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0),
       AvailableEntries(SM.isOutOfOrder() ? SM.MicroOpBufferSize : 0),
       MaxRetirePerCycle(0) {
+  assert(SM.isOutOfOrder() &&
+         "RetireControlUnit is not available for in-order processors");
   // Check if the scheduling model provides extra information about the machine
   // processor. If so, then use that information to set the reorder buffer size
   // and the maximum number of instructions retired per cycle.
@@ -33,17 +35,12 @@ RetireControlUnit::RetireControlUnit(const MCSchedModel &SM)
     MaxRetirePerCycle = EPI.MaxRetirePerCycle;
   }
   NumROBEntries = AvailableEntries;
-  if (!SM.isOutOfOrder() && !NumROBEntries)
-    return;
   assert(NumROBEntries && "Invalid reorder buffer size!");
   Queue.resize(2 * NumROBEntries);
 }
 
 // Reserves a number of slots, and returns a new token.
 unsigned RetireControlUnit::dispatch(const InstRef &IR) {
-  if (!NumROBEntries)
-    return UnhandledTokenID;
-
   const Instruction &Inst = *IR.getInstruction();
   unsigned Entries = normalizeQuantity(Inst.getNumMicroOps());
   assert((AvailableEntries >= Entries) && "Reorder Buffer unavailable!");

diff  --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
index cf536979578b..2d2a75cc99a7 100644
--- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -182,7 +182,7 @@ static void addRegisterReadWrite(RegisterFile &PRF, Instruction &IS,
     PRF.addRegisterWrite(WriteRef(SourceIndex, &WS), UsedRegs);
 }
 
-static void notifyInstructionExecute(
+static void notifyInstructionIssue(
     const InstRef &IR,
     const SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &UsedRes,
     const Stage &S) {
@@ -205,28 +205,11 @@ static void notifyInstructionDispatch(const InstRef &IR, unsigned Ops,
 }
 
 llvm::Error InOrderIssueStage::execute(InstRef &IR) {
-  Instruction &IS = *IR.getInstruction();
-  const InstrDesc &Desc = IS.getDesc();
-
-  unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID;
-  if (!Desc.RetireOOO)
-    RCUTokenID = RCU.dispatch(IR);
-  IS.dispatch(RCUTokenID);
-
-  if (Desc.EndGroup) {
-    Bandwidth = 0;
-  } else {
-    unsigned NumMicroOps = IR.getInstruction()->getNumMicroOps();
-    assert(Bandwidth >= NumMicroOps);
-    Bandwidth -= NumMicroOps;
-  }
-
   if (llvm::Error E = tryIssue(IR, &StallCyclesLeft))
     return E;
 
   if (StallCyclesLeft) {
     StalledInst = IR;
-    Bandwidth = 0;
   }
 
   return llvm::ErrorSuccess();
@@ -235,20 +218,26 @@ llvm::Error InOrderIssueStage::execute(InstRef &IR) {
 llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
   Instruction &IS = *IR.getInstruction();
   unsigned SourceIndex = IR.getSourceIndex();
+  const InstrDesc &Desc = IS.getDesc();
 
   if (!canExecute(IR, StallCycles)) {
     LLVM_DEBUG(dbgs() << "[E] Stalled #" << IR << " for " << *StallCycles
                       << " cycles\n");
+    Bandwidth = 0;
     return llvm::ErrorSuccess();
   }
 
+  unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID;
+  IS.dispatch(RCUTokenID);
+
   SmallVector<unsigned, 4> UsedRegs(PRF.getNumRegisterFiles());
   addRegisterReadWrite(PRF, IS, SourceIndex, STI, UsedRegs);
 
-  notifyInstructionDispatch(IR, IS.getDesc().NumMicroOps, UsedRegs, *this);
+  unsigned NumMicroOps = IS.getNumMicroOps();
+  notifyInstructionDispatch(IR, NumMicroOps, UsedRegs, *this);
 
   SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
-  RM->issueInstruction(IS.getDesc(), UsedResources);
+  RM->issueInstruction(Desc, UsedResources);
   IS.execute(SourceIndex);
 
   // Replace resource masks with valid resource processor IDs.
@@ -256,10 +245,17 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
     uint64_t Mask = Use.first.first;
     Use.first.first = RM->resolveResourceMask(Mask);
   }
-  notifyInstructionExecute(IR, UsedResources, *this);
+  notifyInstructionIssue(IR, UsedResources, *this);
+
+  if (Desc.EndGroup) {
+    Bandwidth = 0;
+  } else {
+    assert(Bandwidth >= NumMicroOps);
+    Bandwidth -= NumMicroOps;
+  }
 
   IssuedInst.push_back(IR);
-  ++NumIssued;
+  NumIssued += NumMicroOps;
 
   if (!IR.getInstruction()->getDesc().RetireOOO)
     LastWriteBackCycle = findLastWriteBackCycle(IR);
@@ -267,7 +263,7 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
   return llvm::ErrorSuccess();
 }
 
-llvm::Error InOrderIssueStage::updateIssuedInst() {
+void InOrderIssueStage::updateIssuedInst() {
   // Update other instructions. Executed instructions will be retired during the
   // next cycle.
   unsigned NumExecuted = 0;
@@ -283,29 +279,37 @@ llvm::Error InOrderIssueStage::updateIssuedInst() {
       ++I;
       continue;
     }
+
+    PRF.onInstructionExecuted(&IS);
     notifyEvent<HWInstructionEvent>(
         HWInstructionEvent(HWInstructionEvent::Executed, IR));
-
     LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n");
     ++NumExecuted;
+
+    retireInstruction(*I);
+
     std::iter_swap(I, E - NumExecuted);
   }
 
-  // Retire instructions in the next cycle
-  if (NumExecuted) {
-    for (auto I = IssuedInst.end() - NumExecuted, E = IssuedInst.end(); I != E;
-         ++I) {
-      if (llvm::Error E = moveToTheNextStage(*I))
-        return E;
-    }
+  if (NumExecuted)
     IssuedInst.resize(IssuedInst.size() - NumExecuted);
-  }
+}
 
-  return llvm::ErrorSuccess();
+void InOrderIssueStage::retireInstruction(InstRef &IR) {
+  Instruction &IS = *IR.getInstruction();
+  IS.retire();
+
+  llvm::SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
+  for (const WriteState &WS : IS.getDefs())
+    PRF.removeRegisterWrite(WS, FreedRegs);
+
+  notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
+  LLVM_DEBUG(dbgs() << "[E] Retired #" << IR << " \n");
 }
 
 llvm::Error InOrderIssueStage::cycleStart() {
   NumIssued = 0;
+  Bandwidth = SM.IssueWidth;
 
   PRF.cycleStart();
 
@@ -313,8 +317,7 @@ llvm::Error InOrderIssueStage::cycleStart() {
   SmallVector<ResourceRef, 4> Freed;
   RM->cycleEvent(Freed);
 
-  if (llvm::Error E = updateIssuedInst())
-    return E;
+  updateIssuedInst();
 
   // Issue instructions scheduled for this cycle
   if (!StallCyclesLeft && StalledInst) {
@@ -325,7 +328,6 @@ llvm::Error InOrderIssueStage::cycleStart() {
   if (!StallCyclesLeft) {
     StalledInst.invalidate();
     assert(NumIssued <= SM.IssueWidth && "Overflow.");
-    Bandwidth = SM.IssueWidth - NumIssued;
   } else {
     // The instruction is still stalled, cannot issue any new instructions in
     // this cycle.

diff  --git a/llvm/lib/MCA/Stages/RetireStage.cpp b/llvm/lib/MCA/Stages/RetireStage.cpp
index 43f71c2e3642..00dbb4b0347a 100644
--- a/llvm/lib/MCA/Stages/RetireStage.cpp
+++ b/llvm/lib/MCA/Stages/RetireStage.cpp
@@ -38,13 +38,6 @@ llvm::Error RetireStage::cycleStart() {
     NumRetired++;
   }
 
-  // Retire instructions that are not controlled by the RCU
-  for (InstRef &IR : RetireInst) {
-    IR.getInstruction()->retire();
-    notifyInstructionRetired(IR);
-  }
-  RetireInst.resize(0);
-
   return llvm::ErrorSuccess();
 }
 
@@ -58,12 +51,9 @@ llvm::Error RetireStage::execute(InstRef &IR) {
 
   PRF.onInstructionExecuted(&IS);
   unsigned TokenID = IS.getRCUTokenID();
-  if (TokenID != RetireControlUnit::UnhandledTokenID) {
-    RCU.onInstructionExecuted(TokenID);
-    return llvm::ErrorSuccess();
-  }
+  assert(TokenID != RetireControlUnit::UnhandledTokenID);
+  RCU.onInstructionExecuted(TokenID);
 
-  RetireInst.push_back(IR);
   return llvm::ErrorSuccess();
 }
 

diff  --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index ff7766f2caec..0015c27228f6 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -339,5 +339,4 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
 def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
 def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
 
-def A55RCU : RetireControlUnit<64, 0>;
 }

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
index be817b755f77..dbcb3c53b22f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
@@ -8,12 +8,12 @@ add      w1, w0, #4
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      8
-# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total Cycles:      9
 # CHECK-NEXT: Total uOps:        8
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.80
-# CHECK-NEXT: IPC:               0.80
+# CHECK-NEXT: uOps Per Cycle:    0.89
+# CHECK-NEXT: IPC:               0.89
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -56,16 +56,16 @@ add      w1, w0, #4
 # CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -     add	w1, w0, #4
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.   .   add	w2, w3, #1
-# CHECK-NEXT: [0,1]     DeeER.   .   add	w4, w3, #2, lsl #12
-# CHECK-NEXT: [0,2]     .DeeER   .   add	w0, w4, #3
-# CHECK-NEXT: [0,3]     . DeeER  .   add	w1, w0, #4
-# CHECK-NEXT: [1,0]     . DeeER  .   add	w2, w3, #1
-# CHECK-NEXT: [1,1]     .  DeeER .   add	w4, w3, #2, lsl #12
-# CHECK-NEXT: [1,2]     .   DeeER.   add	w0, w4, #3
-# CHECK-NEXT: [1,3]     .    DeeER   add	w1, w0, #4
+# CHECK:      [0,0]     DeeE .  .   add	w2, w3, #1
+# CHECK-NEXT: [0,1]     DeeE .  .   add	w4, w3, #2, lsl #12
+# CHECK-NEXT: [0,2]     .DeeE.  .   add	w0, w4, #3
+# CHECK-NEXT: [0,3]     . DeeE  .   add	w1, w0, #4
+# CHECK-NEXT: [1,0]     . DeeE  .   add	w2, w3, #1
+# CHECK-NEXT: [1,1]     .  DeeE .   add	w4, w3, #2, lsl #12
+# CHECK-NEXT: [1,2]     .   DeeE.   add	w0, w4, #3
+# CHECK-NEXT: [1,3]     .    DeeE   add	w1, w0, #4
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s
index a672c8c879ae..9081fb525ee2 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s
@@ -10,12 +10,12 @@ str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      12
-# CHECK-NEXT: Total Cycles:      21
+# CHECK-NEXT: Total Cycles:      20
 # CHECK-NEXT: Total uOps:        14
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: uOps Per Cycle:    0.70
+# CHECK-NEXT: IPC:               0.60
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Instruction Info:
@@ -35,7 +35,7 @@ str	w0, [x21, x18, lsl #2]
 # CHECK-NEXT:  1      4     1.00           *            str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      8  (38.1%)
+# CHECK-NEXT: RAT     - Register unavailable:                      8  (40.0%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
@@ -44,33 +44,22 @@ str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              11  (52.4%)
-# CHECK-NEXT:  1,              6  (28.6%)
-# CHECK-NEXT:  2,              4  (19.0%)
+# CHECK-NEXT:  0,              10  (50.0%)
+# CHECK-NEXT:  1,              6  (30.0%)
+# CHECK-NEXT:  2,              4  (20.0%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          11  (52.4%)
-# CHECK-NEXT:  1,          6  (28.6%)
-# CHECK-NEXT:  2,          4  (19.0%)
+# CHECK-NEXT:  0,          10  (50.0%)
+# CHECK-NEXT:  1,          6  (30.0%)
+# CHECK-NEXT:  2,          4  (20.0%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: No scheduler resources used.
 
-# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
-# CHECK-NEXT: [# retired], [# cycles]
-# CHECK-NEXT:  0,           14  (66.7%)
-# CHECK-NEXT:  1,           4  (19.0%)
-# CHECK-NEXT:  2,           1  (4.8%)
-# CHECK-NEXT:  3,           2  (9.5%)
-
-# CHECK:      Total ROB Entries:                64
-# CHECK-NEXT: Max Used ROB Entries:             6  ( 9.4% )
-# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
-
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    14
-# CHECK-NEXT: Max number of mappings used:         6
+# CHECK-NEXT: Max number of mappings used:         4
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - CortexA55UnitALU

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s
index 1d4e41a63c63..d49e68adc1c3 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s
@@ -10,12 +10,12 @@ str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      12
-# CHECK-NEXT: Total Cycles:      21
+# CHECK-NEXT: Total Cycles:      20
 # CHECK-NEXT: Total uOps:        14
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: uOps Per Cycle:    0.70
+# CHECK-NEXT: IPC:               0.60
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Instruction Info:
@@ -35,7 +35,7 @@ str	w0, [x21, x18, lsl #2]
 # CHECK-NEXT:  1      4     1.00           *            str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      8  (38.1%)
+# CHECK-NEXT: RAT     - Register unavailable:                      8  (40.0%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
@@ -44,33 +44,22 @@ str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              11  (52.4%)
-# CHECK-NEXT:  1,              6  (28.6%)
-# CHECK-NEXT:  2,              4  (19.0%)
+# CHECK-NEXT:  0,              10  (50.0%)
+# CHECK-NEXT:  1,              6  (30.0%)
+# CHECK-NEXT:  2,              4  (20.0%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          11  (52.4%)
-# CHECK-NEXT:  1,          6  (28.6%)
-# CHECK-NEXT:  2,          4  (19.0%)
+# CHECK-NEXT:  0,          10  (50.0%)
+# CHECK-NEXT:  1,          6  (30.0%)
+# CHECK-NEXT:  2,          4  (20.0%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: No scheduler resources used.
 
-# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
-# CHECK-NEXT: [# retired], [# cycles]
-# CHECK-NEXT:  0,           14  (66.7%)
-# CHECK-NEXT:  1,           4  (19.0%)
-# CHECK-NEXT:  2,           1  (4.8%)
-# CHECK-NEXT:  3,           2  (9.5%)
-
-# CHECK:      Total ROB Entries:                64
-# CHECK-NEXT: Max Used ROB Entries:             6  ( 9.4% )
-# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
-
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    14
-# CHECK-NEXT: Max number of mappings used:         6
+# CHECK-NEXT: Max number of mappings used:         4
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - CortexA55UnitALU
@@ -101,20 +90,20 @@ str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
-
-# CHECK:      [0,0]     DeeER.    .    .    .   ldr	w4, [x2], #4
-# CHECK-NEXT: [0,1]     .DeeER    .    .    .   ldr	w5, [x3]
-# CHECK-NEXT: [0,2]     .   DeeeER.    .    .   madd	w0, w5, w4, w0
-# CHECK-NEXT: [0,3]     .    DeeER.    .    .   add	x3, x3, x13
-# CHECK-NEXT: [0,4]     .    DeeER.    .    .   subs	x1, x1, #1
-# CHECK-NEXT: [0,5]     .    . DeeeER  .    .   str	w0, [x21, x18, lsl #2]
-# CHECK-NEXT: [1,0]     .    .  DeeER  .    .   ldr	w4, [x2], #4
-# CHECK-NEXT: [1,1]     .    .   DeeER .    .   ldr	w5, [x3]
-# CHECK-NEXT: [1,2]     .    .    . DeeeER  .   madd	w0, w5, w4, w0
-# CHECK-NEXT: [1,3]     .    .    .  DeeER  .   add	x3, x3, x13
-# CHECK-NEXT: [1,4]     .    .    .  DeeER  .   subs	x1, x1, #1
-# CHECK-NEXT: [1,5]     .    .    .    DeeeER   str	w0, [x21, x18, lsl #2]
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeE .    .    .   .   ldr	w4, [x2], #4
+# CHECK-NEXT: [0,1]     .DeeE.    .    .   .   ldr	w5, [x3]
+# CHECK-NEXT: [0,2]     .   DeeeE .    .   .   madd	w0, w5, w4, w0
+# CHECK-NEXT: [0,3]     .    DeeE .    .   .   add	x3, x3, x13
+# CHECK-NEXT: [0,4]     .    DeeE .    .   .   subs	x1, x1, #1
+# CHECK-NEXT: [0,5]     .    . DeeeE   .   .   str	w0, [x21, x18, lsl #2]
+# CHECK-NEXT: [1,0]     .    .  DeeE   .   .   ldr	w4, [x2], #4
+# CHECK-NEXT: [1,1]     .    .   DeeE  .   .   ldr	w5, [x3]
+# CHECK-NEXT: [1,2]     .    .    . DeeeE  .   madd	w0, w5, w4, w0
+# CHECK-NEXT: [1,3]     .    .    .  DeeE  .   add	x3, x3, x13
+# CHECK-NEXT: [1,4]     .    .    .  DeeE  .   subs	x1, x1, #1
+# CHECK-NEXT: [1,5]     .    .    .    DeeeE   str	w0, [x21, x18, lsl #2]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s
index de5dbaa3490c..c35332420549 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s
@@ -10,12 +10,12 @@ add	w7, w9, w0
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      12
-# CHECK-NEXT: Total Cycles:      20
+# CHECK-NEXT: Total Cycles:      19
 # CHECK-NEXT: Total uOps:        12
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.60
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.63
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Instruction Info:
@@ -40,37 +40,26 @@ add	w7, w9, w0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
 # CHECK-NEXT: SQ      - Store queue full:                          0
-# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 1  (5.0%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 1  (5.3%)
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              12  (60.0%)
-# CHECK-NEXT:  1,              4  (20.0%)
-# CHECK-NEXT:  2,              4  (20.0%)
+# CHECK-NEXT:  0,              11  (57.9%)
+# CHECK-NEXT:  1,              4  (21.1%)
+# CHECK-NEXT:  2,              4  (21.1%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          12  (60.0%)
-# CHECK-NEXT:  1,          4  (20.0%)
-# CHECK-NEXT:  2,          4  (20.0%)
+# CHECK-NEXT:  0,          11  (57.9%)
+# CHECK-NEXT:  1,          4  (21.1%)
+# CHECK-NEXT:  2,          4  (21.1%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: No scheduler resources used.
 
-# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
-# CHECK-NEXT: [# retired], [# cycles]
-# CHECK-NEXT:  0,           14  (70.0%)
-# CHECK-NEXT:  1,           2  (10.0%)
-# CHECK-NEXT:  2,           2  (10.0%)
-# CHECK-NEXT:  3,           2  (10.0%)
-
-# CHECK:      Total ROB Entries:                64
-# CHECK-NEXT: Max Used ROB Entries:             7  ( 10.9% )
-# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
-
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    12
-# CHECK-NEXT: Max number of mappings used:         7
+# CHECK-NEXT: Max number of mappings used:         6
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - CortexA55UnitALU
@@ -100,21 +89,21 @@ add	w7, w9, w0
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w7, w9, w0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .   .   sdiv	w12, w21, w0
-# CHECK-NEXT: [0,1]     .    DeeER.    .   .   add	w8, w8, #1
-# CHECK-NEXT: [0,2]     .    DeeER.    .   .   add	w1, w2, w0
-# CHECK-NEXT: [0,3]     .    .DeeER    .   .   add	w3, w4, #1
-# CHECK-NEXT: [0,4]     .    .DeeER    .   .   add	w5, w6, w0
-# CHECK-NEXT: [0,5]     .    . DeeER   .   .   add	w7, w9, w0
-# CHECK-NEXT: [1,0]     .    .  DeeeeeeeER .   sdiv	w12, w21, w0
-# CHECK-NEXT: [1,1]     .    .    .  DeeER .   add	w8, w8, #1
-# CHECK-NEXT: [1,2]     .    .    .  DeeER .   add	w1, w2, w0
-# CHECK-NEXT: [1,3]     .    .    .   DeeER.   add	w3, w4, #1
-# CHECK-NEXT: [1,4]     .    .    .   DeeER.   add	w5, w6, w0
-# CHECK-NEXT: [1,5]     .    .    .    DeeER   add	w7, w9, w0
+# CHECK:      [0,0]     DeeeeeeeE .    .  .   sdiv	w12, w21, w0
+# CHECK-NEXT: [0,1]     .    DeeE .    .  .   add	w8, w8, #1
+# CHECK-NEXT: [0,2]     .    DeeE .    .  .   add	w1, w2, w0
+# CHECK-NEXT: [0,3]     .    .DeeE.    .  .   add	w3, w4, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .  .   add	w5, w6, w0
+# CHECK-NEXT: [0,5]     .    . DeeE    .  .   add	w7, w9, w0
+# CHECK-NEXT: [1,0]     .    .  DeeeeeeeE .   sdiv	w12, w21, w0
+# CHECK-NEXT: [1,1]     .    .    .  DeeE .   add	w8, w8, #1
+# CHECK-NEXT: [1,2]     .    .    .  DeeE .   add	w1, w2, w0
+# CHECK-NEXT: [1,3]     .    .    .   DeeE.   add	w3, w4, #1
+# CHECK-NEXT: [1,4]     .    .    .   DeeE.   add	w5, w6, w0
+# CHECK-NEXT: [1,5]     .    .    .    DeeE   add	w7, w9, w0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s
index 6231116f25ac..8935d254cd98 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s
@@ -10,12 +10,12 @@ add	w7, w9, w0
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      12
-# CHECK-NEXT: Total Cycles:      25
+# CHECK-NEXT: Total Cycles:      24
 # CHECK-NEXT: Total uOps:        12
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.48
-# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Instruction Info:
@@ -40,31 +40,21 @@ add	w7, w9, w0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
 # CHECK-NEXT: SQ      - Store queue full:                          0
-# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 7  (28.0%)
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 7  (29.2%)
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              19  (76.0%)
-# CHECK-NEXT:  2,              6  (24.0%)
+# CHECK-NEXT:  0,              18  (75.0%)
+# CHECK-NEXT:  2,              6  (25.0%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          19  (76.0%)
-# CHECK-NEXT:  2,          6  (24.0%)
+# CHECK-NEXT:  0,          18  (75.0%)
+# CHECK-NEXT:  2,          6  (25.0%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: No scheduler resources used.
 
-# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
-# CHECK-NEXT: [# retired], [# cycles]
-# CHECK-NEXT:  0,           18  (72.0%)
-# CHECK-NEXT:  1,           2  (8.0%)
-# CHECK-NEXT:  2,           5  (20.0%)
-
-# CHECK:      Total ROB Entries:                64
-# CHECK-NEXT: Max Used ROB Entries:             7  ( 10.9% )
-# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
-
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    12
 # CHECK-NEXT: Max number of mappings used:         7
@@ -98,20 +88,20 @@ add	w7, w9, w0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeeeeeeeER.    .   .   fdiv	s1, s2, s3
-# CHECK-NEXT: [0,1]     DeeER.    .    .    .   .   add	w8, w8, #1
-# CHECK-NEXT: [0,2]     .DeeER    .    .    .   .   add	w1, w2, w0
-# CHECK-NEXT: [0,3]     .DeeER    .    .    .   .   add	w3, w4, #1
-# CHECK-NEXT: [0,4]     . DeeER   .    .    .   .   add	w5, w6, w0
-# CHECK-NEXT: [0,5]     . DeeER   .    .    .   .   add	w7, w9, w0
-# CHECK-NEXT: [1,0]     .    .    DeeeeeeeeeeeeER   fdiv	s1, s2, s3
-# CHECK-NEXT: [1,1]     .    .    DeeER.    .   .   add	w8, w8, #1
-# CHECK-NEXT: [1,2]     .    .    .DeeER    .   .   add	w1, w2, w0
-# CHECK-NEXT: [1,3]     .    .    .DeeER    .   .   add	w3, w4, #1
-# CHECK-NEXT: [1,4]     .    .    . DeeER   .   .   add	w5, w6, w0
-# CHECK-NEXT: [1,5]     .    .    . DeeER   .   .   add	w7, w9, w0
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeE .    .  .   fdiv	s1, s2, s3
+# CHECK-NEXT: [0,1]     DeeE .    .    .    .  .   add	w8, w8, #1
+# CHECK-NEXT: [0,2]     .DeeE.    .    .    .  .   add	w1, w2, w0
+# CHECK-NEXT: [0,3]     .DeeE.    .    .    .  .   add	w3, w4, #1
+# CHECK-NEXT: [0,4]     . DeeE    .    .    .  .   add	w5, w6, w0
+# CHECK-NEXT: [0,5]     . DeeE    .    .    .  .   add	w7, w9, w0
+# CHECK-NEXT: [1,0]     .    .    DeeeeeeeeeeeeE   fdiv	s1, s2, s3
+# CHECK-NEXT: [1,1]     .    .    DeeE .    .  .   add	w8, w8, #1
+# CHECK-NEXT: [1,2]     .    .    .DeeE.    .  .   add	w1, w2, w0
+# CHECK-NEXT: [1,3]     .    .    .DeeE.    .  .   add	w3, w4, #1
+# CHECK-NEXT: [1,4]     .    .    . DeeE    .  .   add	w5, w6, w0
+# CHECK-NEXT: [1,5]     .    .    . DeeE    .  .   add	w7, w9, w0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s
index 64ee3bdc8355..be57731389ba 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s
@@ -7,12 +7,12 @@ v_add_f32 v2, v1, v0
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      3
-# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total Cycles:      12
 # CHECK-NEXT: Total uOps:        3
 
 # CHECK:      Dispatch Width:    1
-# CHECK-NEXT: uOps Per Cycle:    0.23
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Instruction Info:
@@ -48,12 +48,12 @@ v_add_f32 v2, v1, v0
 # CHECK-NEXT:  -      -      -     1.00    -     1.00    -     v_add_f32_e32 v2, v1, v0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   . .   v_add_f32_e32 v0, v0, v0
-# CHECK-NEXT: [0,1]     .DeeeeER  . .   v_add_f32_e32 v1, v1, v1
-# CHECK-NEXT: [0,2]     .    .DeeeeER   v_add_f32_e32 v2, v1, v0
+# CHECK:      [0,0]     DeeeeE    ..   v_add_f32_e32 v0, v0, v0
+# CHECK-NEXT: [0,1]     .DeeeeE   ..   v_add_f32_e32 v1, v1, v1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   v_add_f32_e32 v2, v1, v0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
index 906ce86b98e5..28d811f01806 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
@@ -42,7 +42,7 @@ v_sqrt_f64 v[4:5], v[4:5]
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      27
-# CHECK-NEXT: Total Cycles:      205
+# CHECK-NEXT: Total Cycles:      204
 # CHECK-NEXT: Total uOps:        27
 
 # CHECK:      Dispatch Width:    1
@@ -134,19 +134,19 @@ v_sqrt_f64 v[4:5], v[4:5]
 # CHECK-NEXT:                     0123456789          0123456789          0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeER .    .    .    .    .    .    .    .    .    .   v_cvt_i32_f64_e32 v0, v[0:1]
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeeeeeeeeeeER.    .    .    .    .    .    .    .    .    .   v_cvt_f64_i32_e32 v[2:3], v2
-# CHECK-NEXT: [0,2]     . DeeeeeeeeeeeeeeeeeeeeeER    .    .    .    .    .    .    .    .    .   v_cvt_f32_f64_e32 v4, v[4:5]
-# CHECK-NEXT: [0,3]     .  DeeeeeeeeeeeeeeeeeeeeeER   .    .    .    .    .    .    .    .    .   v_cvt_f64_f32_e32 v[6:7], v6
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    .    .    .    .    .   v_cvt_u32_f64_e32 v8, v[8:9]
-# CHECK-NEXT: [0,5]     .    DeeeeeeeeeeeeeeeeeeeeeER .    .    .    .    .    .    .    .    .   v_cvt_f64_u32_e32 v[10:11], v10
-# CHECK-NEXT: [0,6]     .    .    .    .    . DeeeeeeeeeeeeeeeeeeeeeER    .    .    .    .    .   v_frexp_exp_i32_f64_e32 v0, v[0:1]
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeeeeeeeeeeeeeeeeeeeeER   .    .    .    .    .   v_frexp_mant_f64_e32 v[2:3], v[2:3]
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeER  .    .    .    .    .   v_fract_f64_e32 v[4:5], v[4:5]
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeER  .   v_trunc_f64_e32 v[0:1], v[0:1]
-# CHECK-NEXT: [0,10]    .    .    .    .    .    .    .    .    .    DeeeeeeeeeeeeeeeeeeeeeER .   v_ceil_f64_e32 v[2:3], v[2:3]
-# CHECK-NEXT: [0,11]    .    .    .    .    .    .    .    .    .    .DeeeeeeeeeeeeeeeeeeeeeER.   v_rndne_f64_e32 v[4:5], v[4:5]
-# CHECK-NEXT: [0,12]    .    .    .    .    .    .    .    .    .    . DeeeeeeeeeeeeeeeeeeeeeER   v_floor_f64_e32 v[6:7], v[6:7]
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeE  .    .    .    .    .    .    .    .    .    .   v_cvt_i32_f64_e32 v0, v[0:1]
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeeeeeeeeeeE .    .    .    .    .    .    .    .    .    .   v_cvt_f64_i32_e32 v[2:3], v2
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeeeeeeeeeeeeeE.    .    .    .    .    .    .    .    .    .   v_cvt_f32_f64_e32 v4, v[4:5]
+# CHECK-NEXT: [0,3]     .  DeeeeeeeeeeeeeeeeeeeeeE    .    .    .    .    .    .    .    .    .   v_cvt_f64_f32_e32 v[6:7], v6
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeeeeeeeeeeeeeE   .    .    .    .    .    .    .    .    .   v_cvt_u32_f64_e32 v8, v[8:9]
+# CHECK-NEXT: [0,5]     .    DeeeeeeeeeeeeeeeeeeeeeE  .    .    .    .    .    .    .    .    .   v_cvt_f64_u32_e32 v[10:11], v10
+# CHECK-NEXT: [0,6]     .    .    .    .    . DeeeeeeeeeeeeeeeeeeeeeE.    .    .    .    .    .   v_frexp_exp_i32_f64_e32 v0, v[0:1]
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeeeeeeeeeeeeeeeeeeeeE    .    .    .    .    .   v_frexp_mant_f64_e32 v[2:3], v[2:3]
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeE   .    .    .    .    .   v_fract_f64_e32 v[4:5], v[4:5]
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeE   .   v_trunc_f64_e32 v[0:1], v[0:1]
+# CHECK-NEXT: [0,10]    .    .    .    .    .    .    .    .    .    DeeeeeeeeeeeeeeeeeeeeeE  .   v_ceil_f64_e32 v[2:3], v[2:3]
+# CHECK-NEXT: [0,11]    .    .    .    .    .    .    .    .    .    .DeeeeeeeeeeeeeeeeeeeeeE .   v_rndne_f64_e32 v[4:5], v[4:5]
+# CHECK-NEXT: [0,12]    .    .    .    .    .    .    .    .    .    . DeeeeeeeeeeeeeeeeeeeeeE.   v_floor_f64_e32 v[6:7], v[6:7]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s
index f195c069ef15..ad9a2b1ad0c2 100644
--- a/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s
+++ b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s
@@ -9,12 +9,12 @@ vldr d0, [r1]
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      3
-# CHECK-NEXT: Total Cycles:      7
+# CHECK-NEXT: Total Cycles:      6
 # CHECK-NEXT: Total uOps:        3
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
@@ -56,11 +56,11 @@ vldr d0, [r1]
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -     2.00   vldr	d0, [r1]
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123456
+# CHECK-NEXT: Index     012345
 
-# CHECK:      [0,0]     DER  ..   add.w	r1, r1, #1
-# CHECK-NEXT: [0,1]     .DER ..   add.w	r1, r1, #2
-# CHECK-NEXT: [0,2]     .  DeER   vldr	d0, [r1]
+# CHECK:      [0,0]     DE   .   add.w	r1, r1, #1
+# CHECK-NEXT: [0,1]     .DE  .   add.w	r1, r1, #2
+# CHECK-NEXT: [0,2]     .  DeE   vldr	d0, [r1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/tools/llvm-mca/Views/TimelineView.cpp b/llvm/tools/llvm-mca/Views/TimelineView.cpp
index c8b481bc7ce6..ceeb267cf119 100644
--- a/llvm/tools/llvm-mca/Views/TimelineView.cpp
+++ b/llvm/tools/llvm-mca/Views/TimelineView.cpp
@@ -77,8 +77,10 @@ void TimelineView::onEvent(const HWInstructionEvent &Event) {
            "Instruction cannot be ready if it hasn't been dispatched yet!");
     WTEntry.CyclesSpentInSQWhileReady +=
         TVEntry.CycleIssued - TVEntry.CycleReady;
-    WTEntry.CyclesSpentAfterWBAndBeforeRetire +=
-        (CurrentCycle - 1) - TVEntry.CycleExecuted;
+    if (CurrentCycle > TVEntry.CycleExecuted) {
+      WTEntry.CyclesSpentAfterWBAndBeforeRetire +=
+          (CurrentCycle - 1) - TVEntry.CycleExecuted;
+    }
     break;
   }
   case HWInstructionEvent::Ready:
@@ -243,7 +245,8 @@ void TimelineView::printTimelineViewEntry(formatted_raw_ostream &OS,
 
   for (unsigned I = Entry.CycleExecuted + 1, E = Entry.CycleRetired; I < E; ++I)
     OS << TimelineView::DisplayChar::RetireLag;
-  OS << TimelineView::DisplayChar::Retired;
+  if (Entry.CycleExecuted < Entry.CycleRetired)
+    OS << TimelineView::DisplayChar::Retired;
 
   // Skip other columns.
   for (unsigned I = Entry.CycleRetired + 1, E = LastCycle; I <= E; ++I)

diff  --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 830a619d2e32..0e0a39883f93 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -278,7 +278,8 @@ static void processViewOptions(bool IsOutOfOrder) {
   processOptionImpl(PrintRegisterFileStats, Default);
   processOptionImpl(PrintDispatchStats, Default);
   processOptionImpl(PrintSchedulerStats, Default);
-  processOptionImpl(PrintRetireStats, Default);
+  if (IsOutOfOrder)
+    processOptionImpl(PrintRetireStats, Default);
 }
 
 // Returns true on success.


        


More information about the llvm-commits mailing list