[llvm] [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (PR #92548)
Rin Dobrescu via llvm-commits
llvm-commits at lists.llvm.org
Fri May 17 06:57:48 PDT 2024
https://github.com/Rin18 created https://github.com/llvm/llvm-project/pull/92548
This patch overrides the clearsSuperRegisters method defined in MCInstrAnalysis to identify register writes that clear the upper portion of all super-registers on the AArch64 architecture.
On AArch64, a write to a general-purpose register with a 32-bit data size is defined to use the lower 32 bits of the register and to zero the upper 32 bits.
Similarly, SIMD and FP instructions operating on scalar data only access the lower bits of the SIMD&FP register. The unused upper bits are cleared to zero on a write.
This also applies to SIMD vector registers when the element size in bits multiplied by the number of lanes is less than 128 (for example, the 64-bit arrangements 8b, 4h, 2s and 1d): in that case the upper 64 bits of the vector register are cleared to zero on a write.
From 7d13c3afd29035a26588e6c3e06bb348f1971375 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Thu, 16 May 2024 11:26:09 +0000
Subject: [PATCH 1/2] Precommit tests.
---
.../AArch64/Neoverse/V1-clear-upper-regs.s | 792 ++++++++++++++++++
.../AArch64/Neoverse/V2-clear-upper-regs.s | 650 ++++++++++++++
2 files changed, 1442 insertions(+)
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000000000..197996d229b61
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,792 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - GPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 107
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 1.87
+# CHECK-NEXT: IPC: 1.87
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 0.33 * ldr w0, [sp]
+# CHECK-NEXT: 1 1 0.25 add x0, x0, x0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - 0.50 0.50 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - add x0, x0, x0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER . add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R . ldr w0, [sp]
+# CHECK-NEXT: [1,1] D=====eER . add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE--R . ldr w0, [sp]
+# CHECK-NEXT: [2,1] D======eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeE--R. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=======eER add x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 6.5 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.9 0.6 0.6 <total>
+
+# CHECK: [1] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [2] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [3] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.8h, v0.8h, v0.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.4s, v0.4s, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.2d, v0.2d, v0.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [8] Code Region - ins
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1] D==eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D====eeER . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1] D======eeER . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D========eeER . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1] D==========eeER. . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D============eeER . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1] D==============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 7.0 0.3 0.0 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1. 4 9.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 8.0 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - lanewise-load
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 1003
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.30
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 8 0.33 * ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - 0.50 - 0.50 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [0,1] D========eeER . . . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 20.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 0000000000000..708f22d54afd3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,650 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [1] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [2] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [3] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [7] Code Region - insr
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 6 1.00 insr z0.s, w0
+# CHECK-NEXT: 1 2 0.25 add z0.s, z0.s, z0.s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - 0.33 1.00 0.33 0.34
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - 1.00 - - insr z0.s, w0
+# CHECK-NEXT: - - - - - - - - - - - - - 0.33 - 0.33 0.34 add z0.s, z0.s, z0.s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . . . . insr z0.s, w0
+# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
+# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
>From a92c9ec38f97260f9a95486df10eebdbffabc716 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Fri, 17 May 2024 11:10:15 +0000
Subject: [PATCH 2/2] [llvm-mca][AArch64] Add AArch64 version of
clearsSuperRegisters.
---
.../MCTargetDesc/AArch64MCTargetDesc.cpp | 46 ++++
.../AArch64/Neoverse/V1-clear-upper-regs.s | 245 +++++++++---------
.../AArch64/Neoverse/V2-clear-upper-regs.s | 210 +++++++--------
3 files changed, 273 insertions(+), 228 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 043f0a03b7975..d545cfefc319d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -432,6 +432,52 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
return false;
}
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.implicit_defs().size();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+ // 32-bit General Purpose Register class.
+ const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+ // Floating Point Register classes narrower than 128 bits.
+ const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+ const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+ const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+ const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // An update to the lower 32 bits of a 64-bit integer register is
+ // architecturally defined to zero extend the upper 32 bits on a write.
+ if (GPR32RC.contains(RegID))
+ return true;
+ // SIMD&FP instructions operating on scalar data only access the lower
+ // bits of a register; the upper bits are cleared to zero on a write. For
+ // SIMD vector registers smaller than 128 bits, the upper 64 bits of the
+ // register are cleared to zero on a write.
+ if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
+ return true;
+ return false;
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.implicit_defs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+ }
+
std::vector<std::pair<uint64_t, uint64_t>>
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
const Triple &TargetTriple) const override {
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
index 197996d229b61..ab81f9fb04af3 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -55,12 +55,12 @@ add v0.16b, v0.16b, v0.16b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 107
+# CHECK-NEXT: Total Cycles: 41
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 1.87
-# CHECK-NEXT: IPC: 1.87
+# CHECK-NEXT: uOps Per Cycle: 4.88
+# CHECK-NEXT: IPC: 4.88
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -94,25 +94,24 @@ add v0.16b, v0.16b, v0.16b
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
-# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - 0.50 0.50 - - - -
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 0.22 0.22 0.28 0.28 - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
-# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - add x0, x0, x0
+# CHECK-NEXT: - - - - - - - 0.22 0.22 0.28 0.28 - - - - add x0, x0, x0
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeeeER . ldr w0, [sp]
-# CHECK-NEXT: [0,1] D====eER . add x0, x0, x0
-# CHECK-NEXT: [1,0] DeeeeE-R . ldr w0, [sp]
-# CHECK-NEXT: [1,1] D=====eER . add x0, x0, x0
-# CHECK-NEXT: [2,0] DeeeeE--R . ldr w0, [sp]
-# CHECK-NEXT: [2,1] D======eER. add x0, x0, x0
-# CHECK-NEXT: [3,0] D=eeeeE--R. ldr w0, [sp]
-# CHECK-NEXT: [3,1] D=======eER add x0, x0, x0
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [1,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [2,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeER. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=====eER add x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -121,20 +120,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr w0, [sp]
-# CHECK-NEXT: 1. 4 6.5 0.0 0.0 add x0, x0, x0
-# CHECK-NEXT: 4 3.9 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 0.5 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 5.3 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.3 0.6 0.3 <total>
# CHECK: [1] Code Region - FPR8-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -176,17 +175,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -195,20 +194,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [2] Code Region - FPR16-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -250,17 +249,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -269,20 +268,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [3] Code Region - FPR32-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -324,17 +323,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -343,20 +342,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [4] Code Region - SIMD64-bit-b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -398,17 +397,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.16b, v0.16b, v0.16b
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.16b, v0.16b, v0.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -417,20 +416,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-h
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -472,17 +471,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.8h, v0.8h, v0.8h
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.8h, v0.8h, v0.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -491,20 +490,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -546,17 +545,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.4s, v0.4s, v0.4s
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.4s, v0.4s, v0.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -565,20 +564,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [7] Code Region - SIMD64-bit-d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -620,17 +619,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.2d, v0.2d, v0.2d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.2d, v0.2d, v0.2d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -639,9 +638,9 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [8] Code Region - ins
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index 708f22d54afd3..20c97eeaa8733 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -45,12 +45,12 @@ add z0.s, z0.s, z0.s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -94,17 +94,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -113,20 +113,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [1] Code Region - FPR16-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -170,17 +170,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -189,20 +189,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [2] Code Region - FPR32-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -246,17 +246,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -265,20 +265,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [3] Code Region - SIMD64-bit-b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -322,17 +322,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -341,20 +341,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [4] Code Region - SIMD64-bit-h
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -398,17 +398,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -417,20 +417,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -474,17 +474,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -493,20 +493,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -550,17 +550,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -569,9 +569,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [7] Code Region - insr
More information about the llvm-commits
mailing list