[llvm] 85e6e74 - [MCA] Switching from conservatively guessing which instructions are

Patrick Holland via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 11 13:50:56 PST 2022


Author: Patrick Holland
Date: 2022-01-11T13:50:14-08:00
New Revision: 85e6e748d426f8992016914b07bc67c4da22e278

URL: https://github.com/llvm/llvm-project/commit/85e6e748d426f8992016914b07bc67c4da22e278
DIFF: https://github.com/llvm/llvm-project/commit/85e6e748d426f8992016914b07bc67c4da22e278.diff

LOG: [MCA] Switching from conservatively guessing which instructions are
memory-barrier instructions to providing targets and developers a convenient
way to explicitly declare which instructions are memory-barriers.

Differential Revision: https://reviews.llvm.org/D116779

Added: 
    llvm/lib/Target/X86/MCA/CMakeLists.txt
    llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
    llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
    llvm/test/tools/llvm-mca/X86/barrier_output.s

Modified: 
    llvm/docs/CommandGuide/llvm-mca.rst
    llvm/include/llvm/MCA/CustomBehaviour.h
    llvm/include/llvm/MCA/Instruction.h
    llvm/lib/MCA/HardwareUnits/LSUnit.cpp
    llvm/lib/Target/X86/CMakeLists.txt
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s
    llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
    llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s
    llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s
    llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s
    llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
    llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s
    llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s
    llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s
    llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s
    llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
    llvm/tools/llvm-mca/Views/InstructionInfoView.h
    llvm/tools/llvm-mca/llvm-mca.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst
index d226936042ca4..b08f088762799 100644
--- a/llvm/docs/CommandGuide/llvm-mca.rst
+++ b/llvm/docs/CommandGuide/llvm-mca.rst
@@ -182,6 +182,11 @@ option specifies "``-``", then the output will also be sent to standard output.
 
   Enable the printing of instruction encodings within the instruction info view.
 
+.. option:: -show-barriers
+
+  Enable the printing of LoadBarrier and StoreBarrier flags within the
+  instruction info view.
+
 .. option:: -all-stats
 
   Print all hardware statistics. This enables extra statistics related to the
@@ -949,15 +954,16 @@ cache.  It only knows if an instruction "MayLoad" and/or "MayStore."  For
 loads, the scheduling model provides an "optimistic" load-to-use latency (which
 usually matches the load-to-use latency for when there is a hit in the L1D).
 
-:program:`llvm-mca` does not know about serializing operations or memory-barrier
-like instructions.  The LSUnit conservatively assumes that an instruction which
-has both "MayLoad" and unmodeled side effects behaves like a "soft"
-load-barrier.  That means, it serializes loads without forcing a flush of the
-load queue.  Similarly, instructions that "MayStore" and have unmodeled side
-effects are treated like store barriers.  A full memory barrier is a "MayLoad"
-and "MayStore" instruction with unmodeled side effects.  This is inaccurate, but
-it is the best that we can do at the moment with the current information
-available in LLVM.
+:program:`llvm-mca` does not (on its own) know about serializing operations or
+memory-barrier like instructions.  The LSUnit used to conservatively use an
+instruction's "MayLoad", "MayStore", and unmodeled side effects flags to
+determine whether an instruction should be treated as a memory-barrier. This was
+inaccurate in general and was changed so that now each instruction has an
+IsAStoreBarrier and IsALoadBarrier flag. These flags are mca-specific and
+default to false for every instruction. If any instruction should have either of
+these flags set, it should be done within the target's InstrPostProcess class.
+For an example, look at the ``X86InstrPostProcess::postProcessInstruction``
+method within ``llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp``.
 
 A load/store barrier consumes one entry of the load/store queue.  A load/store
 barrier enforces ordering of loads/stores.  A younger load cannot pass a load

diff  --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h
index 395b07cf722bd..5b993c6a53459 100644
--- a/llvm/include/llvm/MCA/CustomBehaviour.h
+++ b/llvm/include/llvm/MCA/CustomBehaviour.h
@@ -43,6 +43,10 @@ class InstrPostProcess {
 
   virtual ~InstrPostProcess() {}
 
+  /// This method can be overridden by targets to modify the mca::Instruction
+  /// object after it has been lowered from the MCInst.
+  /// This is generally a less disruptive alternative to modifying the
+  /// scheduling model.
   virtual void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
                                       const MCInst &MCI) {}
 };

diff  --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
index 089c607749f1d..33e3c8a2e630e 100644
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -517,9 +517,14 @@ class InstructionBase {
   // Instruction opcode which can be used by mca::CustomBehaviour
   unsigned Opcode;
 
+  // Flags used by the LSUnit.
+  bool IsALoadBarrier;
+  bool IsAStoreBarrier;
+
 public:
   InstructionBase(const InstrDesc &D, const unsigned Opcode)
-      : Desc(D), IsOptimizableMove(false), Operands(0), Opcode(Opcode) {}
+      : Desc(D), IsOptimizableMove(false), Operands(0), Opcode(Opcode),
+        IsALoadBarrier(false), IsAStoreBarrier(false) {}
 
   SmallVectorImpl<WriteState> &getDefs() { return Defs; }
   ArrayRef<WriteState> getDefs() const { return Defs; }
@@ -530,6 +535,10 @@ class InstructionBase {
   unsigned getLatency() const { return Desc.MaxLatency; }
   unsigned getNumMicroOps() const { return Desc.NumMicroOps; }
   unsigned getOpcode() const { return Opcode; }
+  bool isALoadBarrier() const { return IsALoadBarrier; }
+  bool isAStoreBarrier() const { return IsAStoreBarrier; }
+  void setLoadBarrier(bool IsBarrier) { IsALoadBarrier = IsBarrier; }
+  void setStoreBarrier(bool IsBarrier) { IsAStoreBarrier = IsBarrier; }
 
   /// Return the MCAOperand which corresponds to index Idx within the original
   /// MCInst.

diff  --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
index 07be7b077bc9d..121d320f10e61 100644
--- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
+++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
@@ -68,7 +68,8 @@ void LSUnitBase::dump() const {
 
 unsigned LSUnit::dispatch(const InstRef &IR) {
   const InstrDesc &Desc = IR.getInstruction()->getDesc();
-  unsigned IsMemBarrier = Desc.HasSideEffects;
+  bool IsStoreBarrier = IR.getInstruction()->isAStoreBarrier();
+  bool IsLoadBarrier = IR.getInstruction()->isALoadBarrier();
   assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!");
 
   if (Desc.MayLoad)
@@ -111,12 +112,12 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
 
 
     CurrentStoreGroupID = NewGID;
-    if (IsMemBarrier)
+    if (IsStoreBarrier)
       CurrentStoreBarrierGroupID = NewGID;
 
     if (Desc.MayLoad) {
       CurrentLoadGroupID = NewGID;
-      if (IsMemBarrier)
+      if (IsLoadBarrier)
         CurrentLoadBarrierGroupID = NewGID;
     }
 
@@ -141,7 +142,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
   //    However that group has already started execution, so we cannot add
   //    this load to it.
   bool ShouldCreateANewGroup =
-      IsMemBarrier || !ImmediateLoadDominator ||
+      IsLoadBarrier || !ImmediateLoadDominator ||
       CurrentLoadBarrierGroupID == ImmediateLoadDominator ||
       ImmediateLoadDominator <= CurrentStoreGroupID ||
       getGroup(ImmediateLoadDominator).isExecuting();
@@ -161,7 +162,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
     }
 
     // A load barrier may not pass a previous load or load barrier.
-    if (IsMemBarrier) {
+    if (IsLoadBarrier) {
       if (ImmediateLoadDominator) {
         MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator);
         LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
@@ -181,7 +182,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
     }
 
     CurrentLoadGroupID = NewGID;
-    if (IsMemBarrier)
+    if (IsLoadBarrier)
       CurrentLoadBarrierGroupID = NewGID;
     return NewGID;
   }

diff  --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 7aca430a246b2..5dc4920f58245 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -109,5 +109,6 @@ add_llvm_target(X86CodeGen ${sources}
 
 add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
+add_subdirectory(MCA)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(TargetInfo)

diff  --git a/llvm/lib/Target/X86/MCA/CMakeLists.txt b/llvm/lib/Target/X86/MCA/CMakeLists.txt
new file mode 100644
index 0000000000000..f481008cb45e0
--- /dev/null
+++ b/llvm/lib/Target/X86/MCA/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_llvm_component_library(LLVMX86TargetMCA
+  X86CustomBehaviour.cpp
+
+  LINK_COMPONENTS
+  MC
+  MCParser
+  X86Desc
+  X86Info
+  Support
+  MCA
+
+  ADD_TO_COMPONENT
+  X86
+  )

diff  --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
new file mode 100644
index 0000000000000..78379290aae9e
--- /dev/null
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -0,0 +1,64 @@
+//===------------------- X86CustomBehaviour.cpp -----------------*-C++ -* -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the X86InstrPostProcess class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86CustomBehaviour.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86InstrInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/WithColor.h"
+
+namespace llvm {
+namespace mca {
+
+/// Flags the X86 fence instructions as memory barriers on \p Inst.
+/// MFENCE is marked as both a load and a store barrier, LFENCE as a load
+/// barrier only, and SFENCE as a store barrier only. Other opcodes are untouched.
+void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
+                                         const MCInst &MCI) {
+  const unsigned Opc = MCI.getOpcode();
+  const bool OrdersLoads = Opc == X86::MFENCE || Opc == X86::LFENCE;
+  const bool OrdersStores = Opc == X86::MFENCE || Opc == X86::SFENCE;
+
+  // Only ever raise the flags; never clear one that is already set.
+  if (OrdersLoads)
+    Inst->setLoadBarrier(true);
+  if (OrdersStores)
+    Inst->setStoreBarrier(true);
+}
+
+void X86InstrPostProcess::postProcessInstruction(
+    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
+  // Currently, the only adjustment made here is setting the IsALoadBarrier
+  // and IsAStoreBarrier flags of the X86 fence instructions.
+  setMemBarriers(Inst, MCI);
+}
+
+} // namespace mca
+} // namespace llvm
+
+using namespace llvm;
+using namespace mca;
+/// Factory callback registered with the TargetRegistry for the X86 targets.
+static InstrPostProcess *createX86InstrPostProcess(const MCSubtargetInfo &STI,
+                                                   const MCInstrInfo &MCII) {
+  return new X86InstrPostProcess(STI, MCII);
+}
+
+/// Extern function that registers the X86InstrPostProcess factory with both
+/// the X86_32 and the X86_64 targets in the TargetRegistry.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMCA() {
+  TargetRegistry::RegisterInstrPostProcess(getTheX86_32Target(),
+                                           createX86InstrPostProcess);
+  TargetRegistry::RegisterInstrPostProcess(getTheX86_64Target(),
+                                           createX86InstrPostProcess);
+}

diff  --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
new file mode 100644
index 0000000000000..24d26751f0a1d
--- /dev/null
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -0,0 +1,47 @@
+//===-------------------- X86CustomBehaviour.h ------------------*-C++ -* -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the X86InstrPostProcess class which inherits from
+/// InstrPostProcess. This class is used by the tool llvm-mca to apply
+/// target specific behaviour that is not expressed well enough in the
+/// scheduling model for mca to enforce it automatically.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H
+#define LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/CustomBehaviour.h"
+#include "llvm/Support/TargetParser.h"
+
+namespace llvm {
+namespace mca {
+
+/// Post-processes lowered X86 mca::Instruction objects for llvm-mca.
+class X86InstrPostProcess : public InstrPostProcess {
+  /// Called within X86InstrPostProcess to specify certain instructions
+  /// as load and store barriers.
+  void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
+public:
+  X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
+      : InstrPostProcess(STI, MCII) {}
+
+  ~X86InstrPostProcess() {}
+
+  /// Adjusts \p Inst after lowering; currently only sets barrier flags.
+  void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
+                              const MCInst &MCI) override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s
index 9741b2a8a1758..dc681f4ce9479 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s
@@ -10,12 +10,12 @@ ldr x3, [x10]
 
 # CHECK:      Iterations:        3
 # CHECK-NEXT: Instructions:      18
-# CHECK-NEXT: Total Cycles:      19
+# CHECK-NEXT: Total Cycles:      16
 # CHECK-NEXT: Total uOps:        18
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.95
-# CHECK-NEXT: IPC:               0.95
+# CHECK-NEXT: uOps Per Cycle:    1.13
+# CHECK-NEXT: IPC:               1.13
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Instruction Info:
@@ -62,27 +62,27 @@ ldr x3, [x10]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -     ldr	x3, [x10]
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DE   .    .    .  .   str	x1, [x10]
-# CHECK-NEXT: [0,1]     .DE  .    .    .  .   str	x1, [x10]
-# CHECK-NEXT: [0,2]     .DeeE.    .    .  .   ldr	x2, [x10]
-# CHECK-NEXT: [0,3]     .  DE.    .    .  .   nop
-# CHECK-NEXT: [0,4]     .   DeeE  .    .  .   ldr	x2, [x10]
-# CHECK-NEXT: [0,5]     .    DeeE .    .  .   ldr	x3, [x10]
-# CHECK-NEXT: [1,0]     .    DE   .    .  .   str	x1, [x10]
-# CHECK-NEXT: [1,1]     .    .DE  .    .  .   str	x1, [x10]
-# CHECK-NEXT: [1,2]     .    .DeeE.    .  .   ldr	x2, [x10]
-# CHECK-NEXT: [1,3]     .    .  DE.    .  .   nop
-# CHECK-NEXT: [1,4]     .    .   DeeE  .  .   ldr	x2, [x10]
-# CHECK-NEXT: [1,5]     .    .    DeeE .  .   ldr	x3, [x10]
-# CHECK-NEXT: [2,0]     .    .    DE   .  .   str	x1, [x10]
-# CHECK-NEXT: [2,1]     .    .    .DE  .  .   str	x1, [x10]
-# CHECK-NEXT: [2,2]     .    .    .DeeE.  .   ldr	x2, [x10]
-# CHECK-NEXT: [2,3]     .    .    .  DE.  .   nop
-# CHECK-NEXT: [2,4]     .    .    .   DeeE.   ldr	x2, [x10]
-# CHECK-NEXT: [2,5]     .    .    .    DeeE   ldr	x3, [x10]
+# CHECK:      [0,0]     DE   .    .    .   str	x1, [x10]
+# CHECK-NEXT: [0,1]     .DE  .    .    .   str	x1, [x10]
+# CHECK-NEXT: [0,2]     .DeeE.    .    .   ldr	x2, [x10]
+# CHECK-NEXT: [0,3]     .  DE.    .    .   nop
+# CHECK-NEXT: [0,4]     .  DeeE   .    .   ldr	x2, [x10]
+# CHECK-NEXT: [0,5]     .   DeeE  .    .   ldr	x3, [x10]
+# CHECK-NEXT: [1,0]     .   DE    .    .   str	x1, [x10]
+# CHECK-NEXT: [1,1]     .    DE   .    .   str	x1, [x10]
+# CHECK-NEXT: [1,2]     .    DeeE .    .   ldr	x2, [x10]
+# CHECK-NEXT: [1,3]     .    . DE .    .   nop
+# CHECK-NEXT: [1,4]     .    . DeeE    .   ldr	x2, [x10]
+# CHECK-NEXT: [1,5]     .    .  DeeE   .   ldr	x3, [x10]
+# CHECK-NEXT: [2,0]     .    .  DE.    .   str	x1, [x10]
+# CHECK-NEXT: [2,1]     .    .   DE    .   str	x1, [x10]
+# CHECK-NEXT: [2,2]     .    .   DeeE  .   ldr	x2, [x10]
+# CHECK-NEXT: [2,3]     .    .    .DE  .   nop
+# CHECK-NEXT: [2,4]     .    .    .DeeE.   ldr	x2, [x10]
+# CHECK-NEXT: [2,5]     .    .    . DeeE   ldr	x3, [x10]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
index 939d3b0620131..706ed36f9e980 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s
@@ -40,12 +40,12 @@ s_waitcnt vmcnt(0) lgkmcnt(0)
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      36
-# CHECK-NEXT: Total Cycles:      331
+# CHECK-NEXT: Total Cycles:      94
 # CHECK-NEXT: Total uOps:        36
 
 # CHECK:      Dispatch Width:    1
-# CHECK-NEXT: uOps Per Cycle:    0.11
-# CHECK-NEXT: IPC:               0.11
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.38
 # CHECK-NEXT: Block RThroughput: 36.0
 
 # CHECK:      Instruction Info:
@@ -147,45 +147,45 @@ s_waitcnt vmcnt(0) lgkmcnt(0)
 # CHECK-NEXT:  -      -      -     1.00    -      -      -     s_waitcnt vmcnt(0) lgkmcnt(0)
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789          0123456789
+# CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   s_load_dwordx2 s[2:3], s[0:1], 0x24
-# CHECK-NEXT: [0,1]     .DeeeeE   .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   s_load_dwordx2 s[0:1], s[0:1], 0x2c
-# CHECK-NEXT: [0,2]     .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   s_waitcnt lgkmcnt(0)
-# CHECK-NEXT: [0,3]     .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v0, s2
-# CHECK-NEXT: [0,4]     .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v1, s3
-# CHECK-NEXT: [0,5]     .    .   DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   flat_load_dword v2, v[0:1]
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   flat_load_dword v3, v[0:1] offset:8
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   flat_load_dword v4, v[0:1] offset:16
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE.   flat_load_dword v5, v[0:1] offset:24
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v0, s0
-# CHECK-NEXT: [0,10]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v1, s1
-# CHECK-NEXT: [0,11]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v6, s6
-# CHECK-NEXT: [0,12]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v7, s7
-# CHECK-NEXT: [0,13]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v8, s8
-# CHECK-NEXT: [0,14]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v9, s9
-# CHECK-NEXT: [0,15]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v10, s10
-# CHECK-NEXT: [0,16]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v11, s11
-# CHECK-NEXT: [0,17]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v12, s12
-# CHECK-NEXT: [0,18]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v13, s13
-# CHECK-NEXT: [0,19]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v14, s14
-# CHECK-NEXT: [0,20]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v15, s15
-# CHECK-NEXT: [0,21]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v16, s16
-# CHECK-NEXT: [0,22]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v17, s17
-# CHECK-NEXT: [0,23]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v18, s18
-# CHECK-NEXT: [0,24]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v19, s19
-# CHECK-NEXT: [0,25]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v20, s20
-# CHECK-NEXT: [0,26]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v21, s21
-# CHECK-NEXT: [0,27]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v22, s22
-# CHECK-NEXT: [0,28]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v23, s23
-# CHECK-NEXT: [0,29]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v24, s24
-# CHECK-NEXT: [0,30]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v25, s25
-# CHECK-NEXT: [0,31]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v26, s26
-# CHECK-NEXT: [0,32]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v27, s27
-# CHECK-NEXT: [0,33]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v28, s28
-# CHECK-NEXT: [0,34]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .   v_mov_b32_e32 v29, s29
-# CHECK-NEXT: [0,35]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   DE   s_waitcnt vmcnt(0) lgkmcnt(0)
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   s_load_dwordx2 s[2:3], s[0:1], 0x24
+# CHECK-NEXT: [0,1]     .DeeeeE   .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   s_load_dwordx2 s[0:1], s[0:1], 0x2c
+# CHECK-NEXT: [0,2]     .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   s_waitcnt lgkmcnt(0)
+# CHECK-NEXT: [0,3]     .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v0, s2
+# CHECK-NEXT: [0,4]     .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v1, s3
+# CHECK-NEXT: [0,5]     .    .   DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE.  .   flat_load_dword v2, v[0:1]
+# CHECK-NEXT: [0,6]     .    .    DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE  .   flat_load_dword v3, v[0:1] offset:8
+# CHECK-NEXT: [0,7]     .    .    .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE .   flat_load_dword v4, v[0:1] offset:16
+# CHECK-NEXT: [0,8]     .    .    . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE.   flat_load_dword v5, v[0:1] offset:24
+# CHECK-NEXT: [0,9]     .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v0, s0
+# CHECK-NEXT: [0,10]    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v1, s1
+# CHECK-NEXT: [0,11]    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v6, s6
+# CHECK-NEXT: [0,12]    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v7, s7
+# CHECK-NEXT: [0,13]    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v8, s8
+# CHECK-NEXT: [0,14]    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v9, s9
+# CHECK-NEXT: [0,15]    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v10, s10
+# CHECK-NEXT: [0,16]    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v11, s11
+# CHECK-NEXT: [0,17]    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v12, s12
+# CHECK-NEXT: [0,18]    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v13, s13
+# CHECK-NEXT: [0,19]    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v14, s14
+# CHECK-NEXT: [0,20]    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v15, s15
+# CHECK-NEXT: [0,21]    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v16, s16
+# CHECK-NEXT: [0,22]    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v17, s17
+# CHECK-NEXT: [0,23]    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v18, s18
+# CHECK-NEXT: [0,24]    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v19, s19
+# CHECK-NEXT: [0,25]    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v20, s20
+# CHECK-NEXT: [0,26]    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v21, s21
+# CHECK-NEXT: [0,27]    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v22, s22
+# CHECK-NEXT: [0,28]    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v23, s23
+# CHECK-NEXT: [0,29]    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v24, s24
+# CHECK-NEXT: [0,30]    .    .    .    .    .    .    .   DE    .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v25, s25
+# CHECK-NEXT: [0,31]    .    .    .    .    .    .    .    DE   .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v26, s26
+# CHECK-NEXT: [0,32]    .    .    .    .    .    .    .    .DE  .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v27, s27
+# CHECK-NEXT: [0,33]    .    .    .    .    .    .    .    . DE .    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v28, s28
+# CHECK-NEXT: [0,34]    .    .    .    .    .    .    .    .  DE.    .    .    .    .    .    .    .    .    .    .  .   v_mov_b32_e32 v29, s29
+# CHECK-NEXT: [0,35]    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    .    . DE   s_waitcnt vmcnt(0) lgkmcnt(0)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions

diff  --git a/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s b/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s
index 259a478ddd1e1..d9aabb783f1e7 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s
@@ -528,10 +528,10 @@ movaps %xmm3, (%rbx)
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       movd	%mm0, (%rax)
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       movd	%mm1, (%rcx)
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       movd	%mm2, (%rdx)
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       movd	%mm3, (%rbx)
-# CHECK-NEXT:        1     2.5    0.3    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    1.0    0.0       movd	%mm1, (%rcx)
+# CHECK-NEXT: 2.     1     3.0    1.0    0.0       movd	%mm2, (%rdx)
+# CHECK-NEXT: 3.     1     4.0    1.0    0.0       movd	%mm3, (%rbx)
+# CHECK-NEXT:        1     2.5    1.0    0.0       <total>
 
 # CHECK:      [5] Code Region
 

diff  --git a/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s
index a7fbb2352c737..1259de2cdd8b1 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s
@@ -519,12 +519,12 @@ movaps %xmm3, (%rbx)
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      553
+# CHECK-NEXT: Total Cycles:      405
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.72
-# CHECK-NEXT: IPC:               0.72
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
@@ -544,25 +544,24 @@ movaps %xmm3, (%rbx)
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
-# CHECK-NEXT: SCHEDQ  - Scheduler full:                            57  (10.3%)
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            347  (85.7%)
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          432  (78.1%)
+# CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 # CHECK-NEXT: USH     - Uncategorised Structural Hazard:           0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              364  (65.8%)
-# CHECK-NEXT:  1,              88  (15.9%)
-# CHECK-NEXT:  2,              4  (0.7%)
-# CHECK-NEXT:  3,              84  (15.2%)
-# CHECK-NEXT:  4,              13  (2.4%)
+# CHECK-NEXT:  0,              131  (32.3%)
+# CHECK-NEXT:  1,              174  (43.0%)
+# CHECK-NEXT:  2,              87  (21.5%)
+# CHECK-NEXT:  4,              13  (3.2%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          253  (45.8%)
-# CHECK-NEXT:  1,          200  (36.2%)
-# CHECK-NEXT:  2,          100  (18.1%)
+# CHECK-NEXT:  0,          105  (25.9%)
+# CHECK-NEXT:  1,          200  (49.4%)
+# CHECK-NEXT:  2,          100  (24.7%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: [1] Resource name.
@@ -571,10 +570,10 @@ movaps %xmm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             23         40         40
-# CHECK-NEXT: PdFPU            23         40         64
-# CHECK-NEXT: PdLoad           3          22         40
-# CHECK-NEXT: PdStore          22         24         24
+# CHECK-NEXT: PdEX             36         40         40
+# CHECK-NEXT: PdFPU            36         40         64
+# CHECK-NEXT: PdLoad           20         23         40
+# CHECK-NEXT: PdStore          19         22         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -608,8 +607,8 @@ movaps %xmm3, (%rbx)
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
 # CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     3.00    -      -      -      -     1.00   movd	%mm0, (%rax)
-# CHECK-NEXT: 1.50   1.50    -      -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -     movd	(%rcx), %mm1
-# CHECK-NEXT: 1.50   1.50    -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -      -     movd	(%rdx), %mm2
+# CHECK-NEXT: 3.00    -      -      -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -     movd	(%rcx), %mm1
+# CHECK-NEXT:  -     3.00    -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -      -     movd	(%rdx), %mm2
 # CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     3.00    -      -      -      -      -     1.00   movd	%mm3, (%rbx)
 
 # CHECK:      Timeline view:
@@ -630,8 +629,8 @@ movaps %xmm3, (%rbx)
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       movd	%mm0, (%rax)
 # CHECK-NEXT: 1.     1     1.0    1.0    0.0       movd	(%rcx), %mm1
 # CHECK-NEXT: 2.     1     2.0    2.0    0.0       movd	(%rdx), %mm2
-# CHECK-NEXT: 3.     1     4.0    1.0    1.0       movd	%mm3, (%rbx)
-# CHECK-NEXT:        1     2.0    1.3    0.3       <total>
+# CHECK-NEXT: 3.     1     4.0    2.0    1.0       movd	%mm3, (%rbx)
+# CHECK-NEXT:        1     2.0    1.5    0.3       <total>
 
 # CHECK:      [5] Code Region
 

diff  --git a/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s b/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s
index 7490f71e5fa5d..5caf3b67f5b35 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s
@@ -6,12 +6,12 @@ stmxcsr (%rsp)
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      4
-# CHECK-NEXT: Total Cycles:      205
+# CHECK-NEXT: Total Cycles:      103
 # CHECK-NEXT: Total uOps:        6
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.03
-# CHECK-NEXT: IPC:               0.02
+# CHECK-NEXT: uOps Per Cycle:    0.06
+# CHECK-NEXT: IPC:               0.04
 # CHECK-NEXT: Block RThroughput: 18.0
 
 # CHECK:      Instruction Info:
@@ -28,10 +28,12 @@ stmxcsr (%rsp)
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          012
 
-# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.   int3
-# CHECK-NEXT: [0,1]     D====================================================================================================eER   stmxcsr	(%rsp)
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   int3
+# CHECK-NEXT: [0,1]     DeE---------------------------------------------------------------------------------------------------R   stmxcsr	(%rsp)
+# CHECK-NEXT: [1,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   int3
+# CHECK-NEXT: [1,1]     .D=================eE---------------------------------------------------------------------------------R   stmxcsr	(%rsp)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -40,6 +42,6 @@ stmxcsr (%rsp)
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     51.5   0.5    0.0       int3
-# CHECK-NEXT: 1.     2     151.0  0.0    0.0       stmxcsr	(%rsp)
-# CHECK-NEXT:        2     101.3  0.3    0.0       <total>
+# CHECK-NEXT: 0.     2     1.0    0.5    0.0       int3
+# CHECK-NEXT: 1.     2     9.5    9.0    90.0      stmxcsr	(%rsp)
+# CHECK-NEXT:        2     5.3    4.8    45.0      <total>

diff  --git a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
index e7e177bc9f6a2..0a2368f5f3f58 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s
@@ -514,12 +514,12 @@ vmovaps %ymm3, (%rbx)
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total Cycles:      603
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    0.66
+# CHECK-NEXT: IPC:               0.66
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Instruction Info:
@@ -541,21 +541,21 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          748  (93.2%)
+# CHECK-NEXT: SQ      - Store queue full:                          560  (92.9%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 # CHECK-NEXT: USH     - Uncategorised Structural Hazard:           0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              422  (52.6%)
-# CHECK-NEXT:  1,              374  (46.6%)
-# CHECK-NEXT:  2,              1  (0.1%)
-# CHECK-NEXT:  4,              6  (0.7%)
+# CHECK-NEXT:  0,              222  (36.8%)
+# CHECK-NEXT:  1,              374  (62.0%)
+# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  4,              6  (1.0%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          403  (50.2%)
-# CHECK-NEXT:  1,          400  (49.8%)
+# CHECK-NEXT:  0,          203  (33.7%)
+# CHECK-NEXT:  1,          400  (66.3%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: [1] Resource name.
@@ -564,8 +564,8 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             21         23         40
-# CHECK-NEXT: PdFPU            21         23         64
+# CHECK-NEXT: PdEX             21         22         40
+# CHECK-NEXT: PdFPU            21         22         64
 # CHECK-NEXT: PdLoad           0          0          40
 # CHECK-NEXT: PdStore          22         24         24
 
@@ -606,13 +606,12 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     3.00    -      -      -      -      -     1.00   movd	%mm3, (%rbx)
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .   movd	%mm0, (%rax)
-# CHECK-NEXT: [0,1]     D==eeER   .   movd	%mm1, (%rcx)
-# CHECK-NEXT: [0,2]     D====eeER .   movd	%mm2, (%rdx)
-# CHECK-NEXT: [0,3]     D======eeER   movd	%mm3, (%rbx)
+# CHECK:      [0,0]     DeeER.  .   movd	%mm0, (%rax)
+# CHECK-NEXT: [0,1]     D=eeER  .   movd	%mm1, (%rcx)
+# CHECK-NEXT: [0,2]     D===eeER.   movd	%mm2, (%rdx)
+# CHECK-NEXT: [0,3]     D====eeER   movd	%mm3, (%rbx)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -622,10 +621,10 @@ vmovaps %ymm3, (%rbx)
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       movd	%mm0, (%rax)
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       movd	%mm1, (%rcx)
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       movd	%mm2, (%rdx)
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       movd	%mm3, (%rbx)
-# CHECK-NEXT:        1     4.0    0.3    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    1.0    0.0       movd	%mm1, (%rcx)
+# CHECK-NEXT: 2.     1     4.0    2.0    0.0       movd	%mm2, (%rdx)
+# CHECK-NEXT: 3.     1     5.0    1.0    0.0       movd	%mm3, (%rbx)
+# CHECK-NEXT:        1     3.0    1.3    0.0       <total>
 
 # CHECK:      [5] Code Region
 

diff  --git a/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s b/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s
index ee31bf4b8b18d..d91de1a399035 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s
@@ -6,12 +6,12 @@ stmxcsr (%rsp)
 
 # CHECK:      Iterations:        2
 # CHECK-NEXT: Instructions:      4
-# CHECK-NEXT: Total Cycles:      205
+# CHECK-NEXT: Total Cycles:      104
 # CHECK-NEXT: Total uOps:        4
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.02
-# CHECK-NEXT: IPC:               0.02
+# CHECK-NEXT: uOps Per Cycle:    0.04
+# CHECK-NEXT: IPC:               0.04
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -31,7 +31,9 @@ stmxcsr (%rsp)
 # CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          0123
 
 # CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.   int3
-# CHECK-NEXT: [0,1]     D====================================================================================================eER   stmxcsr	(%rsp)
+# CHECK-NEXT: [0,1]     DeE---------------------------------------------------------------------------------------------------R.   stmxcsr	(%rsp)
+# CHECK-NEXT: [1,0]     .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER   int3
+# CHECK-NEXT: [1,1]     .DeE---------------------------------------------------------------------------------------------------R   stmxcsr	(%rsp)
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -40,6 +42,6 @@ stmxcsr (%rsp)
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     51.0   0.5    0.0       int3
-# CHECK-NEXT: 1.     2     151.0  0.0    0.0       stmxcsr	(%rsp)
-# CHECK-NEXT:        2     101.0  0.3    0.0       <total>
+# CHECK-NEXT: 0.     2     1.0    1.0    0.0       int3
+# CHECK-NEXT: 1.     2     1.0    0.0    99.0      stmxcsr	(%rsp)
+# CHECK-NEXT:        2     1.0    0.5    49.5      <total>

diff  --git a/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s b/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s
index 131bf50b77b51..d6163658f7454 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s
@@ -12,12 +12,12 @@ retq
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total Cycles:      308
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.85
-# CHECK-NEXT: IPC:               0.85
+# CHECK-NEXT: uOps Per Cycle:    1.95
+# CHECK-NEXT: IPC:               1.95
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Instruction Info:
@@ -66,27 +66,27 @@ retq
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -      -      -     retq
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .    .    .   .   stmxcsr	-4(%rsp)
-# CHECK-NEXT: [0,1]     DeER .    .    .    .   .   movl	$-24577, %eax
-# CHECK-NEXT: [0,2]     .DeeeeER  .    .    .   .   andl	-4(%rsp), %eax
-# CHECK-NEXT: [0,3]     .D====eER .    .    .   .   movl	%eax, -8(%rsp)
-# CHECK-NEXT: [0,4]     . D===eeeER    .    .   .   ldmxcsr	-8(%rsp)
-# CHECK-NEXT: [0,5]     . DeeeeE--R    .    .   .   retq
-# CHECK-NEXT: [1,0]     .  D=====eER   .    .   .   stmxcsr	-4(%rsp)
-# CHECK-NEXT: [1,1]     .  DeE-----R   .    .   .   movl	$-24577, %eax
-# CHECK-NEXT: [1,2]     .   D====eeeeER.    .   .   andl	-4(%rsp), %eax
-# CHECK-NEXT: [1,3]     .   D========eER    .   .   movl	%eax, -8(%rsp)
-# CHECK-NEXT: [1,4]     .    D=======eeeER  .   .   ldmxcsr	-8(%rsp)
-# CHECK-NEXT: [1,5]     .    D=eeeeE-----R  .   .   retq
-# CHECK-NEXT: [2,0]     .    .D=========eER .   .   stmxcsr	-4(%rsp)
-# CHECK-NEXT: [2,1]     .    .DeE---------R .   .   movl	$-24577, %eax
-# CHECK-NEXT: [2,2]     .    . D========eeeeER  .   andl	-4(%rsp), %eax
-# CHECK-NEXT: [2,3]     .    . D============eER .   movl	%eax, -8(%rsp)
-# CHECK-NEXT: [2,4]     .    .  D===========eeeER   ldmxcsr	-8(%rsp)
-# CHECK-NEXT: [2,5]     .    .  D=eeeeE---------R   retq
+# CHECK:      [0,0]     DeER .    .    ..   stmxcsr	-4(%rsp)
+# CHECK-NEXT: [0,1]     DeER .    .    ..   movl	$-24577, %eax
+# CHECK-NEXT: [0,2]     .DeeeeER  .    ..   andl	-4(%rsp), %eax
+# CHECK-NEXT: [0,3]     .D====eER .    ..   movl	%eax, -8(%rsp)
+# CHECK-NEXT: [0,4]     . D===eeeER    ..   ldmxcsr	-8(%rsp)
+# CHECK-NEXT: [0,5]     . DeeeeE--R    ..   retq
+# CHECK-NEXT: [1,0]     .  D===eE--R   ..   stmxcsr	-4(%rsp)
+# CHECK-NEXT: [1,1]     .  DeE-----R   ..   movl	$-24577, %eax
+# CHECK-NEXT: [1,2]     .   DeeeeE--R  ..   andl	-4(%rsp), %eax
+# CHECK-NEXT: [1,3]     .   D====eE-R  ..   movl	%eax, -8(%rsp)
+# CHECK-NEXT: [1,4]     .    D===eeeER ..   ldmxcsr	-8(%rsp)
+# CHECK-NEXT: [1,5]     .    D=eeeeE-R ..   retq
+# CHECK-NEXT: [2,0]     .    .D===eE--R..   stmxcsr	-4(%rsp)
+# CHECK-NEXT: [2,1]     .    .DeE-----R..   movl	$-24577, %eax
+# CHECK-NEXT: [2,2]     .    . DeeeeE--R.   andl	-4(%rsp), %eax
+# CHECK-NEXT: [2,3]     .    . D====eE-R.   movl	%eax, -8(%rsp)
+# CHECK-NEXT: [2,4]     .    .  D===eeeER   ldmxcsr	-8(%rsp)
+# CHECK-NEXT: [2,5]     .    .  D=eeeeE-R   retq
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -95,10 +95,10 @@ retq
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     5.7    0.3    0.0       stmxcsr	-4(%rsp)
-# CHECK-NEXT: 1.     3     1.0    1.0    4.7       movl	$-24577, %eax
-# CHECK-NEXT: 2.     3     5.0    0.3    0.0       andl	-4(%rsp), %eax
-# CHECK-NEXT: 3.     3     9.0    0.0    0.0       movl	%eax, -8(%rsp)
-# CHECK-NEXT: 4.     3     8.0    0.0    0.0       ldmxcsr	-8(%rsp)
-# CHECK-NEXT: 5.     3     1.7    1.7    5.3       retq
-# CHECK-NEXT:        3     5.1    0.6    1.7       <total>
+# CHECK-NEXT: 0.     3     3.0    1.0    1.3       stmxcsr	-4(%rsp)
+# CHECK-NEXT: 1.     3     1.0    1.0    3.3       movl	$-24577, %eax
+# CHECK-NEXT: 2.     3     1.0    1.0    1.3       andl	-4(%rsp), %eax
+# CHECK-NEXT: 3.     3     5.0    0.0    0.7       movl	%eax, -8(%rsp)
+# CHECK-NEXT: 4.     3     4.0    0.0    0.0       ldmxcsr	-8(%rsp)
+# CHECK-NEXT: 5.     3     1.7    1.7    1.3       retq
+# CHECK-NEXT:        3     2.6    0.8    1.3       <total>

diff  --git a/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s b/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s
index 18acd1ef68638..b40322b9f3a39 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s
@@ -5,11 +5,11 @@ fxrstor (%rsp)
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      100
-# CHECK-NEXT: Total Cycles:      6403
+# CHECK-NEXT: Total Cycles:      4720
 # CHECK-NEXT: Total uOps:        9000
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.41
+# CHECK-NEXT: uOps Per Cycle:    1.91
 # CHECK-NEXT: IPC:               0.02
 # CHECK-NEXT: Block RThroughput: 22.5
 

diff  --git a/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s b/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s
index 383160d219f08..9214e48cd39b6 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s
@@ -12,12 +12,12 @@ retq
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total Cycles:      413
 # CHECK-NEXT: Total uOps:        1300
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.46
+# CHECK-NEXT: uOps Per Cycle:    3.15
+# CHECK-NEXT: IPC:               1.45
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Instruction Info:
@@ -50,39 +50,39 @@ retq
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
-# CHECK-NEXT:  -      -     1.74   1.74   1.67   1.68   2.00   1.74   1.78   1.65
+# CHECK-NEXT:  -      -     1.99   1.50   1.66   1.67   2.00   1.52   1.99   1.67
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
-# CHECK-NEXT:  -      -      -      -     0.30    -     1.00   1.00    -     0.70   stmxcsr	-4(%rsp)
-# CHECK-NEXT:  -      -     0.03   0.53    -      -      -     0.23   0.21    -     movl	$-24577, %eax
-# CHECK-NEXT:  -      -     0.22   0.58   0.35   0.65    -      -     0.20    -     andl	-4(%rsp), %eax
-# CHECK-NEXT:  -      -      -      -     0.05    -     1.00    -      -     0.95   movl	%eax, -8(%rsp)
-# CHECK-NEXT:  -      -     1.00   0.21   0.34   0.66    -     0.42   0.37    -     ldmxcsr	-8(%rsp)
-# CHECK-NEXT:  -      -     0.49   0.42   0.63   0.37    -     0.09   1.00    -     retq
+# CHECK-NEXT:  -      -      -      -     0.16    -     1.00   1.00    -     0.84   stmxcsr	-4(%rsp)
+# CHECK-NEXT:  -      -     0.49   0.49    -      -      -     0.01   0.01    -     movl	$-24577, %eax
+# CHECK-NEXT:  -      -     0.49   0.02   0.49   0.51    -     0.01   0.48    -     andl	-4(%rsp), %eax
+# CHECK-NEXT:  -      -      -      -     0.17    -     1.00    -      -     0.83   movl	%eax, -8(%rsp)
+# CHECK-NEXT:  -      -     1.00   0.01   0.33   0.67    -     0.49   0.50    -     ldmxcsr	-8(%rsp)
+# CHECK-NEXT:  -      -     0.01   0.98   0.51   0.49    -     0.01   1.00    -     retq
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    . .   stmxcsr	-4(%rsp)
-# CHECK-NEXT: [0,1]     DeE-R.    .    .    .    .    .    .    . .   movl	$-24577, %eax
-# CHECK-NEXT: [0,2]     .DeeeeeeER.    .    .    .    .    .    . .   andl	-4(%rsp), %eax
-# CHECK-NEXT: [0,3]     .D======eER    .    .    .    .    .    . .   movl	%eax, -8(%rsp)
-# CHECK-NEXT: [0,4]     . D=====eeeeeeeER   .    .    .    .    . .   ldmxcsr	-8(%rsp)
-# CHECK-NEXT: [0,5]     .  DeeeeeeeE----R   .    .    .    .    . .   retq
-# CHECK-NEXT: [1,0]     .   D==========eeER .    .    .    .    . .   stmxcsr	-4(%rsp)
-# CHECK-NEXT: [1,1]     .   DeE-----------R .    .    .    .    . .   movl	$-24577, %eax
-# CHECK-NEXT: [1,2]     .    D=========eeeeeeER  .    .    .    . .   andl	-4(%rsp), %eax
-# CHECK-NEXT: [1,3]     .    D===============eER .    .    .    . .   movl	%eax, -8(%rsp)
-# CHECK-NEXT: [1,4]     .    .D==============eeeeeeeER.    .    . .   ldmxcsr	-8(%rsp)
-# CHECK-NEXT: [1,5]     .    . DeeeeeeeE-------------R.    .    . .   retq
-# CHECK-NEXT: [2,0]     .    .  D===================eeER   .    . .   stmxcsr	-4(%rsp)
-# CHECK-NEXT: [2,1]     .    .  DeE--------------------R   .    . .   movl	$-24577, %eax
-# CHECK-NEXT: [2,2]     .    .   D==================eeeeeeER    . .   andl	-4(%rsp), %eax
-# CHECK-NEXT: [2,3]     .    .   D========================eER   . .   movl	%eax, -8(%rsp)
-# CHECK-NEXT: [2,4]     .    .    D=======================eeeeeeeER   ldmxcsr	-8(%rsp)
-# CHECK-NEXT: [2,5]     .    .    .DeeeeeeeE----------------------R   retq
+# CHECK:      [0,0]     DeeER.    .    .    .   .   stmxcsr	-4(%rsp)
+# CHECK-NEXT: [0,1]     DeE-R.    .    .    .   .   movl	$-24577, %eax
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    .    .   .   andl	-4(%rsp), %eax
+# CHECK-NEXT: [0,3]     .D======eER    .    .   .   movl	%eax, -8(%rsp)
+# CHECK-NEXT: [0,4]     . D=====eeeeeeeER   .   .   ldmxcsr	-8(%rsp)
+# CHECK-NEXT: [0,5]     .  DeeeeeeeE----R   .   .   retq
+# CHECK-NEXT: [1,0]     .   D====eeE----R   .   .   stmxcsr	-4(%rsp)
+# CHECK-NEXT: [1,1]     .   DeE---------R   .   .   movl	$-24577, %eax
+# CHECK-NEXT: [1,2]     .    DeeeeeeE---R   .   .   andl	-4(%rsp), %eax
+# CHECK-NEXT: [1,3]     .    D======eE--R   .   .   movl	%eax, -8(%rsp)
+# CHECK-NEXT: [1,4]     .    .D=====eeeeeeeER   .   ldmxcsr	-8(%rsp)
+# CHECK-NEXT: [1,5]     .    . D=eeeeeeeE---R   .   retq
+# CHECK-NEXT: [2,0]     .    .  D====eeE----R   .   stmxcsr	-4(%rsp)
+# CHECK-NEXT: [2,1]     .    .  DeE---------R   .   movl	$-24577, %eax
+# CHECK-NEXT: [2,2]     .    .   DeeeeeeE---R   .   andl	-4(%rsp), %eax
+# CHECK-NEXT: [2,3]     .    .   D======eE--R   .   movl	%eax, -8(%rsp)
+# CHECK-NEXT: [2,4]     .    .    D=====eeeeeeeER   ldmxcsr	-8(%rsp)
+# CHECK-NEXT: [2,5]     .    .    .DeeeeeeeE----R   retq
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -91,10 +91,10 @@ retq
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     10.7   0.3    0.0       stmxcsr	-4(%rsp)
-# CHECK-NEXT: 1.     3     1.0    1.0    10.7      movl	$-24577, %eax
-# CHECK-NEXT: 2.     3     10.0   0.3    0.0       andl	-4(%rsp), %eax
-# CHECK-NEXT: 3.     3     16.0   0.0    0.0       movl	%eax, -8(%rsp)
-# CHECK-NEXT: 4.     3     15.0   0.0    0.0       ldmxcsr	-8(%rsp)
-# CHECK-NEXT: 5.     3     1.0    1.0    13.0      retq
-# CHECK-NEXT:        3     8.9    0.4    3.9       <total>
+# CHECK-NEXT: 0.     3     3.7    1.0    2.7       stmxcsr	-4(%rsp)
+# CHECK-NEXT: 1.     3     1.0    1.0    6.3       movl	$-24577, %eax
+# CHECK-NEXT: 2.     3     1.0    1.0    2.0       andl	-4(%rsp), %eax
+# CHECK-NEXT: 3.     3     7.0    0.0    1.3       movl	%eax, -8(%rsp)
+# CHECK-NEXT: 4.     3     6.0    0.0    0.0       ldmxcsr	-8(%rsp)
+# CHECK-NEXT: 5.     3     1.3    1.3    3.7       retq
+# CHECK-NEXT:        3     3.3    0.7    2.7       <total>

diff  --git a/llvm/test/tools/llvm-mca/X86/barrier_output.s b/llvm/test/tools/llvm-mca/X86/barrier_output.s
new file mode 100644
index 0000000000000..dddfa044039fa
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/barrier_output.s
@@ -0,0 +1,25 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -summary-view=false -show-barriers < %s | FileCheck %s
+
+clflush (%rax)
+lfence
+mfence
+sfence
+maskmovdqu	%xmm0, %xmm1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: LoadBarrier
+# CHECK-NEXT: [8]: StoreBarrier
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  4      5     1.00    *      *      U                   clflush	(%rax)
+# CHECK-NEXT:  1      1     1.00    *      *      U      *            lfence
+# CHECK-NEXT:  1      1     1.00    *      *      U      *      *     mfence
+# CHECK-NEXT:  1      1     1.00    *      *      U             *     sfence
+# CHECK-NEXT:  1      1     1.00    *      *      U                   maskmovdqu	%xmm0, %xmm1

diff  --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
index 3f6abf4af2cf6..caa8554a416ab 100644
--- a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
+++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -32,14 +32,30 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
   TempStream << "\n\nInstruction Info:\n";
   TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
              << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n";
+  if (PrintBarriers) {
+    TempStream << "[7]: LoadBarrier\n[8]: StoreBarrier\n";
+  }
   if (PrintEncodings) {
-    TempStream << "[7]: Encoding Size\n";
-    TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    [7]    "
-               << "Encodings:                    Instructions:\n";
+    if (PrintBarriers) {
+      TempStream << "[9]: Encoding Size\n";
+      TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    "
+                 << "[9]    Encodings:                    Instructions:\n";
+    } else {
+      TempStream << "[7]: Encoding Size\n";
+      TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    [7]    "
+                 << "Encodings:                    Instructions:\n";
+    }
   } else {
-    TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    Instructions:\n";
+    if (PrintBarriers) {
+      TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    "
+                 << "Instructions:\n";
+    } else {
+      TempStream << "\n[1]    [2]    [3]    [4]    [5]    [6]    "
+                 << "Instructions:\n";
+    }
   }
 
+  int Index = 0;
   for (const auto &I : enumerate(zip(IIVD, Source))) {
     const InstructionInfoViewData &IIVDEntry = std::get<0>(I.value());
 
@@ -68,6 +84,13 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
     TempStream << (IIVDEntry.mayStore ? " *     " : "       ");
     TempStream << (IIVDEntry.hasUnmodeledSideEffects ? " U     " : "       ");
 
+    if (PrintBarriers) {
+      TempStream << (LoweredInsts[Index]->isALoadBarrier() ? " *     "
+                                                           : "       ");
+      TempStream << (LoweredInsts[Index]->isAStoreBarrier() ? " *     "
+                                                            : "       ");
+    }
+
     if (PrintEncodings) {
       StringRef Encoding(CE.getEncoding(I.index()));
       unsigned EncodingSize = Encoding.size();
@@ -83,6 +106,7 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
 
     const MCInst &Inst = std::get<1>(I.value());
     TempStream << printInstructionString(Inst) << '\n';
+    ++Index;
   }
 
   TempStream.flush();

diff  --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.h b/llvm/tools/llvm-mca/Views/InstructionInfoView.h
index 5d52164e2d509..c35d316775f4b 100644
--- a/llvm/tools/llvm-mca/Views/InstructionInfoView.h
+++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.h
@@ -54,6 +54,9 @@ class InstructionInfoView : public InstructionView {
   const llvm::MCInstrInfo &MCII;
   CodeEmitter &CE;
   bool PrintEncodings;
+  bool PrintBarriers;
+  using UniqueInst = std::unique_ptr<Instruction>;
+  ArrayRef<UniqueInst> LoweredInsts;
 
   struct InstructionInfoViewData {
     unsigned NumMicroOpcodes = 0;
@@ -72,9 +75,12 @@ class InstructionInfoView : public InstructionView {
   InstructionInfoView(const llvm::MCSubtargetInfo &ST,
                       const llvm::MCInstrInfo &II, CodeEmitter &C,
                       bool ShouldPrintEncodings, llvm::ArrayRef<llvm::MCInst> S,
-                      llvm::MCInstPrinter &IP)
+                      llvm::MCInstPrinter &IP,
+                      ArrayRef<UniqueInst> LoweredInsts,
+                      bool ShouldPrintBarriers)
       : InstructionView(ST, IP, S), MCII(II), CE(C),
-        PrintEncodings(ShouldPrintEncodings) {}
+        PrintEncodings(ShouldPrintEncodings),
+        PrintBarriers(ShouldPrintBarriers), LoweredInsts(LoweredInsts) {}
 
   void printView(llvm::raw_ostream &OS) const override;
   StringRef getNameAsString() const override { return "InstructionInfoView"; }

diff  --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 0501336ab2077..1826491f3f305 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -219,6 +219,11 @@ static cl::opt<bool> ShowEncoding(
     cl::desc("Print encoding information in the instruction info view"),
     cl::cat(ViewOptions), cl::init(false));
 
+static cl::opt<bool> ShowBarriers(
+    "show-barriers",
+    cl::desc("Print memory barrier information in the instruction info view"),
+    cl::cat(ViewOptions), cl::init(false));
+
 static cl::opt<bool> DisableCustomBehaviour(
     "disable-cb",
     cl::desc(
@@ -504,7 +509,7 @@ int main(int argc, char **argv) {
       // (which does nothing).
       IPP = std::make_unique<mca::InstrPostProcess>(*STI, *MCII);
 
-    std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
+    SmallVector<std::unique_ptr<mca::Instruction>> LoweredSequence;
     for (const MCInst &MCI : Insts) {
       Expected<std::unique_ptr<mca::Instruction>> Inst =
           IB.createInstruction(MCI);
@@ -548,7 +553,8 @@ int main(int argc, char **argv) {
       // Create the views for this pipeline, execute, and emit a report.
       if (PrintInstructionInfoView) {
         Printer.addView(std::make_unique<mca::InstructionInfoView>(
-            *STI, *MCII, CE, ShowEncoding, Insts, *IP));
+            *STI, *MCII, CE, ShowEncoding, Insts, *IP, LoweredSequence,
+            ShowBarriers));
       }
       Printer.addView(
           std::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts));
@@ -624,7 +630,8 @@ int main(int argc, char **argv) {
 
     if (PrintInstructionInfoView)
       Printer.addView(std::make_unique<mca::InstructionInfoView>(
-          *STI, *MCII, CE, ShowEncoding, Insts, *IP));
+          *STI, *MCII, CE, ShowEncoding, Insts, *IP, LoweredSequence,
+          ShowBarriers));
 
     // Fetch custom Views that are to be placed after the InstructionInfoView.
     // Refer to the comment paired with the CB->getStartViews(*IP, Insts); line


        


More information about the llvm-commits mailing list