[llvm] [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (PR #92548)

Rin Dobrescu via llvm-commits llvm-commits at lists.llvm.org
Mon May 20 03:45:41 PDT 2024


https://github.com/Rin18 updated https://github.com/llvm/llvm-project/pull/92548

>From 7d13c3afd29035a26588e6c3e06bb348f1971375 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Thu, 16 May 2024 11:26:09 +0000
Subject: [PATCH 1/3] Precommit tests.

---
 .../AArch64/Neoverse/V1-clear-upper-regs.s    | 792 ++++++++++++++++++
 .../AArch64/Neoverse/V2-clear-upper-regs.s    | 650 ++++++++++++++
 2 files changed, 1442 insertions(+)
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s

diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000000000..197996d229b61
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,792 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - GPR32-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      107
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.87
+# CHECK-NEXT: IPC:               1.87
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     0.33    *                   ldr	w0, [sp]
+# CHECK-NEXT:  1      1     0.25                        add	x0, x0, x0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -     0.50   0.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	w0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     add	x0, x0, x0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	w0, [sp]
+# CHECK-NEXT: [0,1]     D====eER  .   add	x0, x0, x0
+# CHECK-NEXT: [1,0]     DeeeeE-R  .   ldr	w0, [sp]
+# CHECK-NEXT: [1,1]     D=====eER .   add	x0, x0, x0
+# CHECK-NEXT: [2,0]     DeeeeE--R .   ldr	w0, [sp]
+# CHECK-NEXT: [2,1]     D======eER.   add	x0, x0, x0
+# CHECK-NEXT: [3,0]     D=eeeeE--R.   ldr	w0, [sp]
+# CHECK-NEXT: [3,1]     D=======eER   add	x0, x0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	w0, [sp]
+# CHECK-NEXT: 1.     4     6.5    0.0    0.0       add	x0, x0, x0
+# CHECK-NEXT:        4     3.9    0.6    0.6       <total>
+
+# CHECK:      [1] Code Region - FPR8-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	b0, [sp]
+# CHECK-NEXT:  1      2     0.25                        fadd	d0, d0, d0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	b0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [2] Code Region - FPR16-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	h0, [sp]
+# CHECK-NEXT:  1      2     0.25                        fadd	d0, d0, d0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	h0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [3] Code Region - FPR32-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	s0, [sp]
+# CHECK-NEXT:  1      2     0.25                        fadd	d0, d0, d0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	s0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [4] Code Region - SIMD64-bit-b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [5] Code Region - SIMD64-bit-h
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [6] Code Region - SIMD64-bit-s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.4s, v0.4s, v0.4s
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.4s, v0.4s, v0.4s
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.4s, v0.4s, v0.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [7] Code Region - SIMD64-bit-d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.2d, v0.2d, v0.2d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.2d, v0.2d, v0.2d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	v0.2d, v0.2d, v0.2d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [8] Code Region - ins
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.25                        mov	v0.b[0], v1.b[1]
+# CHECK-NEXT:  1      2     0.25                        add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .  .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1]     D==eeER   .    .  .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     D====eeER .    .  .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1]     D======eeER    .  .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     D========eeER  .  .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1]     D==========eeER.  .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D============eeER .   mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1]     D==============eeER   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     7.0    0.3    0.0       mov	v0.b[0], v1.b[1]
+# CHECK-NEXT: 1.     4     9.0    0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     8.0    0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - lanewise-load
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      8     0.33    *                   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT:  1      2     0.25                        add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2]   - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4]   - V1UnitM0
+# CHECK-NEXT: [5]   - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7]   - V1UnitV0
+# CHECK-NEXT: [8]   - V1UnitV1
+# CHECK-NEXT: [9]   - V1UnitV2
+# CHECK-NEXT: [10]  - V1UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50   0.50   0.50
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -     0.50    -     0.50   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [0,1]     D========eeER  .    .    .    .    .    . .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     D==========eeeeeeeeER    .    .    .    . .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [1,1]     D==================eeER  .    .    .    . .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     D====================eeeeeeeeER    .    . .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [2,1]     D============================eeER  .    . .   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D==============================eeeeeeeeER .   ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: [3,1]     D======================================eeER   add	v0.16b, v0.16b, v0.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     16.0   0.3    0.0       ld1	{ v0.b }[0], [sp]
+# CHECK-NEXT: 1.     4     24.0   0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     20.0   0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 0000000000000..708f22d54afd3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,650 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - FPR8-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	b0, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ldr	b0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [1] Code Region - FPR16-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	h0, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ldr	h0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [2] Code Region - FPR32-bit
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ldr	s0, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ldr	s0, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [3] Code Region - SIMD64-bit-b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.8b }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [4] Code Region - SIMD64-bit-h
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.4h }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [5] Code Region - SIMD64-bit-s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.2s }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [6] Code Region - SIMD64-bit-d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     0.33    *                   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  1      2     0.25                        add	z0.d, z0.d, z0.d
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -     0.25   0.25   0.25   0.25
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -      -      -     ld1	{ v0.1d }, [sp]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+
+# CHECK:      [7] Code Region - insr
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    16
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      6     1.00                        insr	z0.s, w0
+# CHECK-NEXT:  1      2     0.25                        add	z0.s, z0.s, z0.s
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2]   - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4]   - V2UnitM0
+# CHECK-NEXT: [5]   - V2UnitM1
+# CHECK-NEXT: [6]   - V2UnitS0
+# CHECK-NEXT: [7]   - V2UnitS1
+# CHECK-NEXT: [8]   - V2UnitS2
+# CHECK-NEXT: [9]   - V2UnitS3
+# CHECK-NEXT: [10]  - V2UnitV0
+# CHECK-NEXT: [11]  - V2UnitV1
+# CHECK-NEXT: [12]  - V2UnitV2
+# CHECK-NEXT: [13]  - V2UnitV3
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   1.00   0.33   0.34
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -      -      -     1.00    -      -     insr	z0.s, w0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.33    -     0.33   0.34   add	z0.s, z0.s, z0.s
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   insr	z0.s, w0
+# CHECK-NEXT: [0,1]     D======eeER    .    .    .    .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0]     D========eeeeeeER   .    .    .   .   insr	z0.s, w0
+# CHECK-NEXT: [1,1]     D==============eeER .    .    .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0]     D================eeeeeeER.    .   .   insr	z0.s, w0
+# CHECK-NEXT: [2,1]     D======================eeER   .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0]     D========================eeeeeeER .   insr	z0.s, w0
+# CHECK-NEXT: [3,1]     D==============================eeER   add	z0.s, z0.s, z0.s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     4     13.0   0.3    0.0       insr	z0.s, w0
+# CHECK-NEXT: 1.     4     19.0   0.0    0.0       add	z0.s, z0.s, z0.s
+# CHECK-NEXT:        4     16.0   0.1    0.0       <total>

>From a92c9ec38f97260f9a95486df10eebdbffabc716 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Fri, 17 May 2024 11:10:15 +0000
Subject: [PATCH 2/3] [llvm-mca][AArch64] Add AArch64 version of
 clearsSuperRegisters.

---
 .../MCTargetDesc/AArch64MCTargetDesc.cpp      |  46 ++++
 .../AArch64/Neoverse/V1-clear-upper-regs.s    | 245 +++++++++---------
 .../AArch64/Neoverse/V2-clear-upper-regs.s    | 210 +++++++--------
 3 files changed, 273 insertions(+), 228 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 043f0a03b7975..d545cfefc319d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -432,6 +432,52 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
     return false;
   }
 
+  bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+                            APInt &Mask) const override {
+    const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+    unsigned NumDefs = Desc.getNumDefs();
+    unsigned NumImplicitDefs = Desc.implicit_defs().size();
+    assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+           "Unexpected number of bits in the mask!");
+    // 32-bit General Purpose Register class.
+    const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+    // Floating Point Register classes lower than 128-bit.
+    const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+    const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+    const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+    const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+
+    auto ClearsSuperReg = [=](unsigned RegID) {
+      // An update to the lower 32 bits of a 64 bit integer register is
+      // architecturally defined to zero extend the upper 32 bits on a write.
+      if (GPR32RC.contains(RegID))
+        return true;
+      // SIMD&FP instructions operating on scalar data only acccess the lower
+      // bits of a register, the upper bits are zero extended on a write. For
+      // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+      // register are zero extended on a write.
+      if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+          FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
+        return true;
+      return false;
+    };
+
+    Mask.clearAllBits();
+    for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+      const MCOperand &Op = Inst.getOperand(I);
+      if (ClearsSuperReg(Op.getReg()))
+        Mask.setBit(I);
+    }
+
+    for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+      const MCPhysReg Reg = Desc.implicit_defs()[I];
+      if (ClearsSuperReg(Reg))
+        Mask.setBit(NumDefs + I);
+    }
+
+    return Mask.getBoolValue();
+  }
+
   std::vector<std::pair<uint64_t, uint64_t>>
   findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
                  const Triple &TargetTriple) const override {
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
index 197996d229b61..ab81f9fb04af3 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -55,12 +55,12 @@ add v0.16b, v0.16b, v0.16b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      107
+# CHECK-NEXT: Total Cycles:      41
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    1.87
-# CHECK-NEXT: IPC:               1.87
+# CHECK-NEXT: uOps Per Cycle:    4.88
+# CHECK-NEXT: IPC:               4.88
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -94,25 +94,24 @@ add v0.16b, v0.16b, v0.16b
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
-# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -     0.50   0.50    -      -      -      -
+# CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34   0.22   0.22   0.28   0.28    -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
 # CHECK-NEXT:  -      -      -      -     0.33   0.33   0.34    -      -      -      -      -      -      -      -     ldr	w0, [sp]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     add	x0, x0, x0
+# CHECK-NEXT:  -      -      -      -      -      -      -     0.22   0.22   0.28   0.28    -      -      -      -     add	x0, x0, x0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeeeER   .   ldr	w0, [sp]
-# CHECK-NEXT: [0,1]     D====eER  .   add	x0, x0, x0
-# CHECK-NEXT: [1,0]     DeeeeE-R  .   ldr	w0, [sp]
-# CHECK-NEXT: [1,1]     D=====eER .   add	x0, x0, x0
-# CHECK-NEXT: [2,0]     DeeeeE--R .   ldr	w0, [sp]
-# CHECK-NEXT: [2,1]     D======eER.   add	x0, x0, x0
-# CHECK-NEXT: [3,0]     D=eeeeE--R.   ldr	w0, [sp]
-# CHECK-NEXT: [3,1]     D=======eER   add	x0, x0, x0
+# CHECK:      [0,0]     DeeeeER .   ldr	w0, [sp]
+# CHECK-NEXT: [0,1]     D====eER.   add	x0, x0, x0
+# CHECK-NEXT: [1,0]     DeeeeE-R.   ldr	w0, [sp]
+# CHECK-NEXT: [1,1]     D====eER.   add	x0, x0, x0
+# CHECK-NEXT: [2,0]     DeeeeE-R.   ldr	w0, [sp]
+# CHECK-NEXT: [2,1]     D====eER.   add	x0, x0, x0
+# CHECK-NEXT: [3,0]     D=eeeeER.   ldr	w0, [sp]
+# CHECK-NEXT: [3,1]     D=====eER   add	x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -121,20 +120,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	w0, [sp]
-# CHECK-NEXT: 1.     4     6.5    0.0    0.0       add	x0, x0, x0
-# CHECK-NEXT:        4     3.9    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    0.5       ldr	w0, [sp]
+# CHECK-NEXT: 1.     4     5.3    0.0    0.0       add	x0, x0, x0
+# CHECK-NEXT:        4     3.3    0.6    0.3       <total>
 
 # CHECK:      [1] Code Region - FPR8-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -176,17 +175,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   fadd	d0, d0, d0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -195,20 +194,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [2] Code Region - FPR16-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -250,17 +249,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   fadd	d0, d0, d0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -269,20 +268,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [3] Code Region - FPR32-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -324,17 +323,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   fadd	d0, d0, d0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   fadd	d0, d0, d0
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   fadd	d0, d0, d0
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   fadd	d0, d0, d0
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   fadd	d0, d0, d0
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   fadd	d0, d0, d0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -343,20 +342,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       fadd	d0, d0, d0
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       fadd	d0, d0, d0
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [4] Code Region - SIMD64-bit-b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -398,17 +397,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.16b, v0.16b, v0.16b
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.16b, v0.16b, v0.16b
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.16b, v0.16b, v0.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -417,20 +416,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.16b, v0.16b, v0.16b
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [5] Code Region - SIMD64-bit-h
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -472,17 +471,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.8h, v0.8h, v0.8h
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.8h, v0.8h, v0.8h
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.8h, v0.8h, v0.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -491,20 +490,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.8h, v0.8h, v0.8h
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [6] Code Region - SIMD64-bit-s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -546,17 +545,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.4s, v0.4s, v0.4s
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.4s, v0.4s, v0.4s
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.4s, v0.4s, v0.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -565,20 +564,20 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [7] Code Region - SIMD64-bit-d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -620,17 +619,17 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	v0.2d, v0.2d, v0.2d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	v0.2d, v0.2d, v0.2d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	v0.2d, v0.2d, v0.2d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -639,9 +638,9 @@ add v0.16b, v0.16b, v0.16b
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [8] Code Region - ins
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index 708f22d54afd3..20c97eeaa8733 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -45,12 +45,12 @@ add z0.s, z0.s, z0.s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -94,17 +94,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	b0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	b0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	b0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	b0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	b0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	b0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -113,20 +113,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	b0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [1] Code Region - FPR16-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -170,17 +170,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	h0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	h0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	h0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	h0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	h0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	h0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -189,20 +189,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	h0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [2] Code Region - FPR32-bit
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -246,17 +246,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ldr	s0, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ldr	s0, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ldr	s0, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ldr	s0, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ldr	s0, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	s0, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -265,20 +265,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ldr	s0, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [3] Code Region - SIMD64-bit-b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -322,17 +322,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -341,20 +341,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [4] Code Region - SIMD64-bit-h
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -398,17 +398,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -417,20 +417,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [5] Code Region - SIMD64-bit-s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -474,17 +474,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -493,20 +493,20 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [6] Code Region - SIMD64-bit-d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      200
-# CHECK-NEXT: Total Cycles:      209
+# CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
 # CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    0.96
-# CHECK-NEXT: IPC:               0.96
+# CHECK-NEXT: uOps Per Cycle:    4.55
+# CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
 
 # CHECK:      Instruction Info:
@@ -550,17 +550,17 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   add	z0.d, z0.d, z0.d
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [0,1]     D======eeER    ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0]     DeeeeeeE--R    ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [1,1]     D========eeER  ..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0]     DeeeeeeE----R  ..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [2,1]     D==========eeER..   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-----R..   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [3,1]     D============eeER   add	z0.d, z0.d, z0.d
+# CHECK:      [0,0]     DeeeeeeER ..   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [0,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -569,9 +569,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    2.8       ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: 1.     4     10.0   0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     5.6    0.6    1.4       <total>
+# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
 
 # CHECK:      [7] Code Region - insr
 

>From b2abd342d8128be4dfa022ade56eca832fe4d26d Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Mon, 20 May 2024 10:41:04 +0000
Subject: [PATCH 3/3] Address comment

---
 .../lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index d545cfefc319d..3247fe03bb488 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -456,10 +456,8 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
       // bits of a register, the upper bits are zero extended on a write. For
       // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
       // register are zero extended on a write.
-      if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
-          FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
-        return true;
-      return false;
+      return FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+             FPR32RC.contains(RegID) || FPR64RC.contains(RegID);
     };
 
     Mask.clearAllBits();



More information about the llvm-commits mailing list