[llvm] 267de85 - [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (#92548)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 22 07:31:41 PDT 2024
Author: Rin Dobrescu
Date: 2024-05-22T15:31:35+01:00
New Revision: 267de8543c8671baa7e12c4d181e6c4e6e2342cd
URL: https://github.com/llvm/llvm-project/commit/267de8543c8671baa7e12c4d181e6c4e6e2342cd
DIFF: https://github.com/llvm/llvm-project/commit/267de8543c8671baa7e12c4d181e6c4e6e2342cd.diff
LOG: [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (#92548)
This patch overrides the clearsSuperRegisters method defined in
MCInstrAnalysis to identify register writes that clear the upper portion
of all super-registers on AArch64 architecture.
On AArch64, a write to a general-purpose register of 32-bit data size is
defined to use the lower 32-bits of the register and zero extend the
upper 32-bits.
Similarly, SIMD and FP instructions operating on scalar data only access
the lower bits of the SIMD&FP register. The unused upper bits are
cleared to zero on a write.
This also applies to SIMD vector registers when the element size in bits
multiplied by the number of lanes is lower than 128. The upper 64 bits
of the vector register are cleared to zero on a write.
Added:
llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
Modified:
llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 0dd4a78f962d4..6493a2ee4a938 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -430,6 +430,55 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
return false;
}
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.implicit_defs().size();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+ // 32-bit General Purpose Register class.
+ const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+ // Floating Point Register classes.
+ const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+ const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+ const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+ const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+ const MCRegisterClass &FPR128RC =
+ MRI.getRegClass(AArch64::FPR128RegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // An update to the lower 32 bits of a 64 bit integer register is
+ // architecturally defined to zero extend the upper 32 bits on a write.
+ if (GPR32RC.contains(RegID))
+ return true;
+ // SIMD&FP instructions operating on scalar data only acccess the lower
+ // bits of a register, the upper bits are zero extended on a write. For
+ // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+ // register are zero extended on a write.
+ // When VL is higher than 128 bits, any write to a SIMD&FP register sets
+ // bits higher than 128 to zero.
+ return FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID) ||
+ FPR128RC.contains(RegID);
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.implicit_defs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+ }
+
std::vector<std::pair<uint64_t, uint64_t>>
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
const Triple &TargetTriple) const override {
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000000000..ab81f9fb04af3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,791 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - GPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 41
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.88
+# CHECK-NEXT: IPC: 4.88
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 0.33 * ldr w0, [sp]
+# CHECK-NEXT: 1 1 0.25 add x0, x0, x0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 0.22 0.22 0.28 0.28 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
+# CHECK-NEXT: - - - - - - - 0.22 0.22 0.28 0.28 - - - - add x0, x0, x0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [1,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [2,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeER. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=====eER add x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 0.5 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 5.3 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.3 0.6 0.3 <total>
+
+# CHECK: [1] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.8h, v0.8h, v0.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.4s, v0.4s, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.2d, v0.2d, v0.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [8] Code Region - ins
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1] D==eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D====eeER . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1] D======eeER . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D========eeER . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1] D==========eeER. . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D============eeER . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1] D==============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 7.0 0.3 0.0 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1. 4 9.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 8.0 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - lanewise-load
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 1003
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.30
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 8 0.33 * ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - 0.50 - 0.50 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [0,1] D========eeER . . . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 20.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 0000000000000..fd2083dc1277a
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,812 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR64-bit
+ldr d0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR128-bit
+ldr q0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [1] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR64-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr d0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr d0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr d0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - FPR128-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr q0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr q0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr q0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [8] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [9] Code Region - insr
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 6 1.00 insr z0.s, w0
+# CHECK-NEXT: 1 2 0.25 add z0.s, z0.s, z0.s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - 0.33 1.00 0.33 0.34
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - 1.00 - - insr z0.s, w0
+# CHECK-NEXT: - - - - - - - - - - - - - 0.33 - 0.33 0.34 add z0.s, z0.s, z0.s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . . . . insr z0.s, w0
+# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
+# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
More information about the llvm-commits
mailing list