[llvm] [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (PR #92548)

Rin Dobrescu via llvm-commits llvm-commits at lists.llvm.org
Fri May 17 06:57:48 PDT 2024


https://github.com/Rin18 created https://github.com/llvm/llvm-project/pull/92548

This patch overrides the clearsSuperRegisters method defined in MCInstrAnalysis to identify register writes that clear the upper portion of all super-registers on AArch64 architecture.

On AArch64, a write to a general-purpose register of 32-bit data size is defined to use the lower 32-bits of the register and zero extend the upper 32-bits.
Similarly, SIMD and FP instructions operating on scalar data only access the lower bits of the SIMD&FP register. The unused upper bits are cleared to zero on a write.
This also applies to SIMD vector registers when the element size in bits multiplied by the number of lanes is lower than 128. The upper 64 bits of the vector register are cleared to zero on a write.

>From 7d13c3afd29035a26588e6c3e06bb348f1971375 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Thu, 16 May 2024 11:26:09 +0000
Subject: [PATCH 1/2] Precommit tests.

---
 .../AArch64/Neoverse/V1-clear-upper-regs.s    | 792 ++++++++++++++++++
 .../AArch64/Neoverse/V2-clear-upper-regs.s    | 650 ++++++++++++++
 2 files changed, 1442 insertions(+)
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s

diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000000000..197996d229b61
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,792 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - GPR32-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      107
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.87
+# CHECK-NEXT: IPC:               1.87
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     0.33    *                   ldr	w0, [sp]
+# CHECK-NEXT:  1      1     0.25                        add	x0, x0, x0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -     0.50   0.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	w0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     add	x0, x0, x0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	w0, [sp]
+# CHECK-NEXT: [0,1]     D====eER  .   add	x0, x0, x0
+# CHECK-NEXT: [1,0]     DeeeeE-R  .   ldr	w0, [sp]
+# CHECK-NEXT: [1,1]     D=====eER .   add	x0, x0, x0
+# CHECK-NEXT: [2,0]     DeeeeE--R .   ldr	w0, [sp]
+# CHECK-NEXT: [2,1]     D======eER.   add	x0, x0, x0
+# CHECK-NEXT: [3,0]     D=eeeeE--R.   ldr	w0, [sp]
+# CHECK-NEXT: [3,1]     D=======eER   add	x0, x0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	w0, [sp]
+# CHECK-NEXT: 1.     4     6.5    0.0    0.0       add	x0, x0, x0
+# CHECK-NEXT:        4     3.9    0.6    0.6       <total>
+
+# CHECK:      [1] Code Region - FPR8-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	b0, [sp]
+# CHECK-NEXT:  1      2     0.25                        fadd	d0, d0, d0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	b0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [2] Code Region - FPR16-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	h0, [sp]
+# CHECK-NEXT:  1      2     0.25                        fadd	d0, d0, d0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	h0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [3] Code Region - FPR32-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	s0, [sp]
+# CHECK-NEXT:  1      2     0.25                        fadd	d0, d0, d0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	s0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [4] Code Region - SIMD64-bit-b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [5] Code Region - SIMD64-bit-h
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [6] Code Region - SIMD64-bit-s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.4s, v0.4s, v0.4s
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.4s, v0.4s, v0.4s
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.4s, v0.4s, v0.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [7] Code Region - SIMD64-bit-d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.2d, v0.2d, v0.2d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.2d, v0.2d, v0.2d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.2d, v0.2d, v0.2d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [8] Code Region - ins
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.25                        mov	v0.b[0], v1.b[1]
+# CHECK-NEXT:  1      2     0.25                        add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .  .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1]     D==eeER   .    .  .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     D====eeER .    .  .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1]     D======eeER    .  .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     D========eeER  .  .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1]     D==========eeER.  .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D============eeER .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1]     D==============eeER   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     7.0    0.3    0.0       mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: 1.     4     9.0    0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     8.0    0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - lanewise-load
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      8     0.33    *                   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50   0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -     0.50    -     0.50   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [0,1]     D========eeER  .    .    .    .    .    . .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     D==========eeeeeeeeER    .    .    .    . .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [1,1]     D==================eeER  .    .    .    . .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     D====================eeeeeeeeER    .    . .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [2,1]     D============================eeER  .    . .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D==============================eeeeeeeeER .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [3,1]     D======================================eeER   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     16.0   0.3    0.0       ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: 1.     4     24.0   0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     20.0   0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 0000000000000..708f22d54afd3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,650 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - FPR8-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	b0, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ldr	b0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [1] Code Region - FPR16-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	h0, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ldr	h0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [2] Code Region - FPR32-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	s0, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ldr	s0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [3] Code Region - SIMD64-bit-b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [4] Code Region - SIMD64-bit-h
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [5] Code Region - SIMD64-bit-s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [6] Code Region - SIMD64-bit-d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [7] Code Region - insr
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      6     1.00                        insr	z0.s, w0
+# CHECK-NEXT:  1      2     0.25                        add	z0.s, z0.s, z0.s
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   1.00   0.33   0.34
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -      -      -     1.00    -      -     insr	z0.s, w0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.33    -     0.33   0.34   add	z0.s, z0.s, z0.s
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   insr	z0.s, w0
+# CHECK-NEXT: [0,1]     D======eeER    .    .    .    .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0]     D========eeeeeeER   .    .    .   .   insr	z0.s, w0
+# CHECK-NEXT: [1,1]     D==============eeER .    .    .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0]     D================eeeeeeER.    .   .   insr	z0.s, w0
+# CHECK-NEXT: [2,1]     D======================eeER   .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0]     D========================eeeeeeER .   insr	z0.s, w0
+# CHECK-NEXT: [3,1]     D==============================eeER   add	z0.s, z0.s, z0.s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     13.0   0.3    0.0       insr	z0.s, w0
+# CHECK-NEXT: 1.     4     19.0   0.0    0.0       add	z0.s, z0.s, z0.s
+# CHECK-NEXT:        4     16.0   0.1    0.0       <total>

>From a92c9ec38f97260f9a95486df10eebdbffabc716 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Fri, 17 May 2024 11:10:15 +0000
Subject: [PATCH 2/2] [llvm-mca][AArch64] Add AArch64 version of
 clearsSuperRegisters.

---
 .../MCTargetDesc/AArch64MCTargetDesc.cpp      |  46 ++++
 .../AArch64/Neoverse/V1-clear-upper-regs.s    | 245 +++++++++---------
 .../AArch64/Neoverse/V2-clear-upper-regs.s    | 210 +++++++--------
 3 files changed, 273 insertions(+), 228 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 043f0a03b7975..d545cfefc319d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -432,6 +432,52 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
     return false;
   }
 
+  bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+                            APInt &Mask) const override {
+    const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+    unsigned NumDefs = Desc.getNumDefs();
+    unsigned NumImplicitDefs = Desc.implicit_defs().size();
+    assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+           "Unexpected number of bits in the mask!");
+    // 32-bit General Purpose Register class.
+    const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+    // Floating Point Register classes lower than 128-bit.
+    const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+    const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+    const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+    const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+
+    auto ClearsSuperReg = [=](unsigned RegID) {
+      // An update to the lower 32 bits of a 64 bit integer register is
+      // architecturally defined to zero extend the upper 32 bits on a write.
+      if (GPR32RC.contains(RegID))
+        return true;
+      // SIMD&FP instructions operating on scalar data only acccess the lower
+      // bits of a register, the upper bits are zero extended on a write. For
+      // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+      // register are zero extended on a write.
+      if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+          FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
+        return true;
+      return false;
+    };
+
+    Mask.clearAllBits();
+    for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+      const MCOperand &Op = Inst.getOperand(I);
+      if (ClearsSuperReg(Op.getReg()))
+        Mask.setBit(I);
+    }
+
+    for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+      const MCPhysReg Reg = Desc.implicit_defs()[I];
+      if (ClearsSuperReg(Reg))
+        Mask.setBit(NumDefs + I);
+    }
+
+    return Mask.getBoolValue();
+  }
+
   std::vector<std::pair<uint64_t, uint64_t>>
   findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
                  const Triple &TargetTriple) const override {
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
index 197996d229b61..ab81f9fb04af3 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -55,12 +55,12 @@ add v0.16b, v0.16b, v0.16b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      107
+# CHECK-NEXT: Total Cycles:      41
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    1.87
-# CHECK-NEXT: IPC:               1.87
+# CHECK-NEXT: uOps Per Cycle:    4.88
+# CHECK-NEXT: IPC:               4.88
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -94,25 +94,24 @@ add v0.16b, v0.16b, v0.16b
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
-# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -     0.50   0.50    -      -      -      -
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34   0.22   0.22   0.28   0.28    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	w0, [sp]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     add	x0, x0, x0
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.22   0.22   0.28   0.28    -      -      -      -     add	x0, x0, x0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeeeER   .   ldr	w0, [sp]
-# CHECK-NEXT: [0,1]     D====eER  .   add	x0, x0, x0
-# CHECK-NEXT: [1,0]     DeeeeE-R  .   ldr	w0, [sp]
-# CHECK-NEXT: [1,1]     D=====eER .   add	x0, x0, x0
-# CHECK-NEXT: [2,0]     DeeeeE--R .   ldr	w0, [sp]
-# CHECK-NEXT: [2,1]     D======eER.   add	x0, x0, x0
-# CHECK-NEXT: [3,0]     D=eeeeE--R.   ldr	w0, [sp]
-# CHECK-NEXT: [3,1]     D=======eER   add	x0, x0, x0
+# CHECK:      [0,0]     DeeeeER .   ldr	w0, [sp]
+# CHECK-NEXT: [0,1]     D====eER.   add	x0, x0, x0
+# CHECK-NEXT: [1,0]     DeeeeE-R.   ldr	w0, [sp]
+# CHECK-NEXT: [1,1]     D====eER.   add	x0, x0, x0
+# CHECK-NEXT: [2,0]     DeeeeE-R.   ldr	w0, [sp]
+# CHECK-NEXT: [2,1]     D====eER.   add	x0, x0, x0
+# CHECK-NEXT: [3,0]     D=eeeeER.   ldr	w0, [sp]
+# CHECK-NEXT: [3,1]     D=====eER   add	x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -121,20 +120,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	w0, [sp]
-# CHECK-NEXT: 1.     4     6.5    0.0    0.0       add	x0, x0, x0
-# CHECK-NEXT:        4     3.9    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    0.5       ldr	w0, [sp]
+# CHECK-NEXT: 1.     4     5.3    0.0    0.0       add	x0, x0, x0
+# CHECK-NEXT:        4     3.3    0.6    0.3       <total>
 
 # CHECK:      [1] Code Region - FPR8-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -176,17 +175,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   fadd	d0, d0, d0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -195,20 +194,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [2] Code Region - FPR16-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -250,17 +249,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   fadd	d0, d0, d0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -269,20 +268,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [3] Code Region - FPR32-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -324,17 +323,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   fadd	d0, d0, d0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -343,20 +342,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [4] Code Region - SIMD64-bit-b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -398,17 +397,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.16b, v0.16b, v0.16b
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.16b, v0.16b, v0.16b
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.16b, v0.16b, v0.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -417,20 +416,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [5] Code Region - SIMD64-bit-h
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -472,17 +471,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.8h, v0.8h, v0.8h
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.8h, v0.8h, v0.8h
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.8h, v0.8h, v0.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -491,20 +490,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [6] Code Region - SIMD64-bit-s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -546,17 +545,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.4s, v0.4s, v0.4s
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.4s, v0.4s, v0.4s
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.4s, v0.4s, v0.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -565,20 +564,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [7] Code Region - SIMD64-bit-d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -620,17 +619,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.2d, v0.2d, v0.2d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.2d, v0.2d, v0.2d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.2d, v0.2d, v0.2d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -639,9 +638,9 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [8] Code Region - ins
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index 708f22d54afd3..20c97eeaa8733 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -45,12 +45,12 @@ add z0.s, z0.s, z0.s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -94,17 +94,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -113,20 +113,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [1] Code Region - FPR16-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -170,17 +170,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -189,20 +189,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [2] Code Region - FPR32-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -246,17 +246,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -265,20 +265,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [3] Code Region - SIMD64-bit-b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -322,17 +322,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -341,20 +341,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [4] Code Region - SIMD64-bit-h
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -398,17 +398,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -417,20 +417,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [5] Code Region - SIMD64-bit-s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -474,17 +474,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -493,20 +493,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [6] Code Region - SIMD64-bit-d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -550,17 +550,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -569,9 +569,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [7] Code Region - insr
 



More information about the llvm-commits mailing list